# Zesheng Jia
# B00845993
# Mount Google Drive so the notebook can persist files across sessions.
from google.colab import drive
drive.mount('/content/drive')
# Download the 5-core Books subset of the Amazon review dataset (gzip JSON).
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Books_5.json.gz
# Hidden Cell: You may unfold this section for code details
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
# import all related libraries
import pandas as pd
import warnings
import seaborn as sns
from datetime import datetime
from datetime import timedelta
import joblib
import datetime
import math
from sklearn.base import BaseEstimator, TransformerMixin
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# For data preprocess
import numpy as np
import csv
import os
# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# get basemap for geographical plot
# from mpl_toolkits.basemap import Basemap
myseed = 42069 # set a random seed for reproducibility
# Force deterministic cuDNN kernels and disable autotuning so GPU runs repeat.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
# Seed every CUDA device as well, when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)
# Numerical Operations
import math
import numpy as np
# Reading/Writing Data
import pandas as pd
import os
import csv
# For Progress Bar
from tqdm import tqdm
# Pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter
# for matplotlib ploting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.model_selection import learning_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import validation_curve
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import numpy
from sklearn import metrics
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix
import nltk
# download the basic list of data and models
nltk.download('popular')
# download "book" collection of datasets from NLTK website
nltk.download("book")
from nltk.book import *
from nltk.corpus import stopwords
# Module-level English stop-word set used by the text-cleaning helpers below.
stop_words = set(stopwords.words('english'))
# Hidden Cell: You may unfold this section for code details
# Apply seaborn's default plotting theme.
sns.set()
# Set up with a higher resolution screen (useful on Mac)
%config InlineBackend.figure_format = 'retina'
###############################################
def max_print_out(pattern=False):
    '''Configure pandas display: widen row/column limits and use .2f floats.

    pattern=True removes the row cap entirely; otherwise rows are capped at 10.
    '''
    row_cap = None if pattern else 10
    # Avoid truncation when displaying a dataframe.
    pd.set_option("display.max_rows", row_cap)
    pd.set_option("display.max_columns", 50)
    # Two decimal places for every float shown.
    pd.set_option('display.float_format', '{:.2f}'.format)
# for showing all entities
# Hidden Cell: You may unfold this section for code details
################################ NEW FUNCTION IN AS3 ################################
#--------------remove_stop_words-------------
def remove_stop_words(data, stop_words):
    '''Lower-case every non-numeric column of *data* and drop the given stop words.

    Mutates *data* in place and returns it.
    '''
    text_columns = data.select_dtypes(exclude="number").columns
    for col in text_columns:
        print("Now it's removing stop words from ", col)
        # Normalise case first so stop-word matching is case-insensitive.
        data[col] = data[col].str.lower()
        data[col] = data[col].apply(
            lambda x: ' '.join(w for w in x.split() if w not in stop_words))
    return data
#-----------get_model_set-------------
def get_model_set(data):
    '''Split a dataframe into features X (all but the last column) and int labels y.'''
    X = data.iloc[:, :-1]
    # Flatten the single-column label slice into a 1-D integer array.
    labels = data.iloc[:, -1:].values.ravel()
    return X, labels.astype(int)
#---------words_importance_plot---------------------
def words_importance_barplot(results, fig_size = (8,8)):
    '''Plot word-importance scores from *results* on a single axes.

    NOTE(review): despite the name and title, this renders a boxplot.
    '''
    fig, axis = plt.subplots(figsize=fig_size)
    results.boxplot(ax=axis)
    axis.set_ylabel('Importance')
    axis.set_title("Barplot of words' importance")
#--------------remove_num_non_letters-------------
def remove_num_non_letters(data):
    '''Strip punctuation and digits from every non-numeric column of *data*.

    Mutates *data* in place and returns it.
    '''
    feature = data.select_dtypes(exclude="number").columns
    for i in range(len(feature)):
        print("Now it's removing num_non_letters from ", feature[i])
        # regex=True is required: pandas >= 2.0 treats str.replace patterns as
        # literal strings by default, which silently skipped these rules.
        data[feature[i]] = data[feature[i]].str.replace(r'[^\w\s]+', '', regex=True)
        data[feature[i]] = data[feature[i]].str.replace(r'[0-9]+', '', regex=True)
    return data
# def a function to draw the Bar plot
#----------------plot_frequenct_words_bar-------------------
def plot_frequenct_words_bar(data, figsize = (15,10), name = 'style'):
    '''Horizontal bar chart of word frequencies for feature *name*.'''
    fig, axis = plt.subplots(figsize=figsize)
    data.plot.barh(ax=axis)
    axis.set_title("Most frequent 50 words' distribution of " + str(name))
    axis.set_ylabel('Counts')
# def a function to get the report
#------------------frequent_words_reports---------------------
def frequent_words_reports(data, feature = 'style'):
    '''Return (and plot) the 50 most frequent words of *feature* as a dataframe.'''
    max_print_out(True)
    # Split every cell into words, stack them into one series, then count.
    counts = data[feature].str.split(expand=True).stack().value_counts().head(50)
    top_words = pd.DataFrame(counts)
    plot_frequenct_words_bar(top_words)
    return top_words
#--------------- get_words_reports-------------
from tqdm.notebook import tqdm
def get_words_dictionary(data, column_number = 3):
    '''Count word occurrences over the text column at position *column_number*.

    Returns a dict mapping each whitespace-separated token to its frequency.
    '''
    words_dictionary = {}
    # tqdm wraps the row loop to show progress on large datasets.
    for i in tqdm(range(len(data))):
        text_array = data.iloc[i, column_number]
        for text in text_array.split():
            # dict.get with a default replaces the explicit `== None` check
            # (and avoids the non-idiomatic equality comparison to None).
            words_dictionary[text] = words_dictionary.get(text, 0) + 1
    return words_dictionary
#-------------words_frequency_report--------------
def words_frequency_report(data, feature = 'reviewText',show_all=False,fig_size = (15,10)):
    '''Count word frequencies in data[feature] and plot a horizontal bar chart.

    Returns the full counts dataframe sorted descending (one row per word).
    Plots every word when show_all is True, otherwise only the top 50.
    '''
    # get column number
    column_number = data.columns.get_loc(feature)
    print("Start getting word reports by ", feature)
    words_dictionary = get_words_dictionary(data, column_number)
    print("### Finish get the words report")
    # get report: one row per word, counts as the single column
    report = pd.DataFrame.from_dict(words_dictionary, orient='index')
    report.columns = ['counts']
    print("Load into Pandas dataFrame")
    # sort report, most frequent first
    report = report.sort_values(by=['counts'], ascending=False)
    print("Sorting DataFrame")
    # decide if print all
    if show_all:
        report_head = report
    else:
        # keep only the 50 most frequent words for plotting
        report_head = report.head(50)
    print("Get report's top words\nStart plotting")
    # plot setting
    fig, ax = plt.subplots(figsize = fig_size)
    report_head.plot.barh(ax = ax)
    if show_all:
        ax.set_title("Distribution of " + str(feature))
    else:
        ax.set_title("Most frequent 50 words' distribution of " + str(feature))
    ax.set_xlabel('Counts')
    ax.set_ylabel('Words')
    # flip the y axis so the most frequent word is drawn at the top
    plt.gca().invert_yaxis()
    return report
from sklearn.feature_selection import SelectKBest
# ------------------feature selection-------------------------
def select_features_prompt(X_train, y_train, X_test,function):
    '''Fit SelectKBest(score_func=function) on the training data and print scores.

    Returns the per-feature score array; the transformed train/test matrices
    are computed (mirroring the original flow) but not returned.
    '''
    # configure to select all features
    fs = SelectKBest(score_func=function, k='all')
    # learn relationship from training data
    fs.fit(X_train, y_train)
    # transform train input data
    X_train_fs = fs.transform(X_train)
    # transform test input data
    X_test_fs = fs.transform(X_test)
    # BUG FIX: the original referenced an undefined global `features_name`;
    # derive the names from X_train itself, falling back to plain indices.
    features_name = list(getattr(X_train, 'columns', range(len(fs.scores_))))
    # print each feature's name and score
    for i in range(len(fs.scores_)):
        print(f'Feature {i} {features_name[i]}: { fs.scores_[i]}' )
    return fs.scores_
#---------words_importance_plot---------------------
def words_importance_plot(results, fig_size = (15,10)):
    '''Horizontal bar chart of word-importance values, most important on top.'''
    fig, axis = plt.subplots(figsize=fig_size)
    results.plot.barh(ax=axis)
    # flip so the highest-importance word appears at the top
    plt.gca().invert_yaxis()
    axis.set_ylabel('Importance')
    axis.set_title("Barplot of words' importance")
#--------------text_item_properties---------------#
'''We want to save all the results to a new dataframe'''
def text_item_properties(data):
    '''Compute per-row text statistics (length, word count, non-alphanumeric
    count, stop-word count) for the text columns of *data*.

    NOTE(review): the result columns are overwritten on every loop pass, so
    effectively only the LAST column's statistics survive — confirm intent.
    Uses the module-level `stop_words` set.
    '''
    result = pd.DataFrame()
    data = data.copy()
    # Fill NaN so the .str accessors below don't fail on missing text.
    data = data.fillna('0')
    for i in range(len(data.columns)):
        col = data[str(data.columns[i])]
        # character length
        result['Text_length'] = col.str.len()
        # number of whitespace-separated words
        result['num_of_words'] = col.str.split().str.len()
        # count of non-alphanumeric characters; regex=True is required because
        # pandas >= 2.0 treats str.replace patterns as literal by default.
        result['presence_non_alphanumeric'] = col.str.replace('[a-zA-Z0-9 ]', '', regex=True).str.len()
        # number of distinct stop words present per row
        result['stop_words_count'] = col.str.split().apply(lambda x: len(set(x) & stop_words))
    return result
#---------------clean_useless_information---------------
def clean_useless_information(data_df, columns = ['reviewText']):
    '''Return a copy of *data_df* with HTML tags, spaces, URLs and newlines
    stripped from the given text columns.

    The mutable default argument is kept for interface compatibility; the
    list is never mutated here.
    '''
    data = data_df.copy()
    for i in range(len(columns)):
        # clean html tag; regex=True is required because pandas >= 2.0 treats
        # str.replace patterns as literal strings by default.
        data[columns[i]] = data[(columns[i])].str.replace(r'<[^<]+?>', '', regex=True)
        # NOTE(review): this removes the literal character below (the comment
        # suggests it was meant to be an &nbsp; entity garbled in export) —
        # as written it deletes every occurrence of that character. Confirm.
        data[(columns[i])] = data[(columns[i])].str.replace(' ', '', regex=False)
        # clean http URL
        data[(columns[i])] = data[(columns[i])].str.replace(r'http\S+', '', regex=True)
        # clean line breaker
        data[(columns[i])] = data[(columns[i])].str.replace('\n', '', regex=False)
    return data
#----------------TDIDF_Data_generator---------------
def TDIDF_Data_generator(data, max_features = 500):
    '''Build a TF-IDF feature dataframe from 'token_text' and 'style',
    carrying over the 'verified' and 'score' columns.

    max_features caps the token_text vocabulary; the style vocabulary is
    fixed at 8 terms.
    '''
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    # we set for max_features: as most words we saved for 500 words
    # in order to not exceed our memory
    v_test = TfidfVectorizer(stop_words='english', max_features = max_features)
    v_style = TfidfVectorizer(stop_words='english', max_features = 8)
    # get TDIDF array from token_text
    x_token_text = v_test.fit_transform(data['token_text'])
    # save it into a pandas dataframe
    tdidf_data = pd.DataFrame(x_token_text.toarray())
    # get TDIDF array from style
    x_style = v_style.fit_transform(data['style'])
    # NOTE(review): both frames use default integer column labels, so the
    # concatenated style columns share labels 0..7 with token_text — confirm.
    tdidf_data = pd.concat([tdidf_data, pd.DataFrame(x_style.toarray())], axis = 1)
    tdidf_data['verified'] = data['verified']
    tdidf_data['score'] = data['score']
    return tdidf_data
#----------------TDIDF_Data_generator_pos---------------
def TDIDF_Data_generator_pos(data, max_features = 1000, feature_name='only_noun'):
    '''Build a TF-IDF dataframe (named columns) from data[feature_name],
    carrying over the 'verified' and 'score' columns.

    max_features caps the vocabulary so memory stays bounded.
    '''
    from sklearn.feature_extraction.text import TfidfVectorizer
    v_test = TfidfVectorizer(stop_words='english', max_features = max_features)
    # get TDIDF array from the requested text column
    x_token_text = v_test.fit_transform(data[feature_name])
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement get_feature_names_out(), falling back for old versions.
    try:
        feature_names = v_test.get_feature_names_out()
    except AttributeError:
        feature_names = v_test.get_feature_names()
    # save it into a pandas dataframe with word names as columns
    tdidf_data = pd.DataFrame(x_token_text.toarray(), columns = feature_names)
    data_copy = data.copy()
    data_copy = data_copy.reset_index() # need to reset index for matching the results
    tdidf_data['verified'] = data_copy['verified']
    tdidf_data['score'] = data_copy['score']
    return tdidf_data
#-------------find_outliers-----------------
def find_outliers(data_df, parameter,* , drop=False, set_threshold=False, threshold_value = 350): # deal with outliers
    '''Detect IQR-based outliers in data_df[parameter] and return the index of
    rows above the upper cut-off.

    set_threshold=True replaces the Tukey upper fence (Q3 + 1.5*IQR) with
    threshold_value.  NOTE(review): `drop` is accepted but never used here.
    '''
    # same with previous find_outliers function
    Q1 = data_df[parameter].quantile(0.25)
    Q3 = data_df[parameter].quantile(0.75)
    IQR = Q3-Q1
    print(f"IQR = {Q3} - {Q1} = {IQR}")
    print(f"MAX = {(Q3 + 1.5 * IQR)}")
    # NOTE(review): this compares Q1 to 1.5*IQR, presumably to check whether
    # the lower fence (Q1 - 1.5*IQR) is positive — confirm intent.
    if Q1 > 1.5*IQR :
        print("Min: ", (Q1 - 1.5 * IQR))
    else:
        print("Min is 0")
    cut_out_value = (Q3 + 1.5 * IQR) # normal outliers deleted
    # override the value if we set threshold
    if set_threshold == True:
        cut_out_value = threshold_value
    # rows below the lower fence
    min_outliers_df = data_df[(data_df[parameter] < (Q1 - 1.5 * IQR))]
    # rows above the (possibly overridden) upper cut-off
    max_outliers_df = data_df[(data_df[parameter] > cut_out_value)]
    # rows with non-positive values
    negative_outliers_df = data_df[(data_df[parameter] <= 0)]
    print("Num of min outliers: ", len(min_outliers_df))
    print("Num of max outliers: ", len(max_outliers_df))
    print("Num of negative outliers: ", len(negative_outliers_df))
    print("Num of the original data set's whole instance", len(data_df))
    print("Rate of purged data/total data", len(max_outliers_df)/ len(data_df))
    # It's pretty hard to drop multiple indexes at the same time
    # Because after one drop action, their index are changed from then
    # We need to alter the order of aboving codes.
    # And it's pretty unnecessary for us to do this in our assignemnt
    # Since we don't have min outliers in this dataset
    # And negative values are not outliers
    # I decided to purge negative values in transformer instead of here
    # Only the upper-fence outlier index is returned.
    return max_outliers_df.index
#---------------clean_useless_information---------------
def show_purged_reports(data_df, parameter = ['reviewText'], output_type = 'num_of_words'):
    '''Compute text statistics for *parameter*, locate outliers on
    *output_type*, and plot the purged distribution two ways.

    Returns (reports, outlier_index).
    '''
    frame = data_df.copy()
    # per-row text statistics for the requested columns
    reports = text_item_properties(frame.loc[:, parameter])
    # indices of rows flagged as outliers on the chosen statistic
    index = find_outliers(reports, output_type)
    kept = reports.drop(index)
    # visualise what remains: histograms, then boxplots
    ax = mulitple_function_plots(data=kept, kde_type=False, plot_type="histogram",
                                 data_type="number", fig_size=(15, 7), tight_layout=False)
    ax = mulitple_function_plots(data=kept, kde_type=False, plot_type="boxplot",
                                 data_type="number", fig_size=(15, 7), tight_layout=False)
    return reports, index
#------------------------learning_curve------------------
# NOTE(review): this definition shadows `sklearn.model_selection.learning_curve`
# imported above, and an identical definition appears again later in the file.
def learning_curve(N, train_lc, val_lc):
    '''Plot mean train/validation scores against the training sizes N.'''
    # set the figure size
    fig, ax = plt.subplots(figsize=(16, 6))
    # mean training score over CV folds (axis 1)
    ax.plot(N, np.mean(train_lc, 1), color='blue', label='training score')
    # mean validation score over CV folds
    ax.plot(N, np.mean(val_lc, 1), color='red', label='validation score')
    # dashed reference line at the mean of the final train/valid scores
    ax.hlines(np.mean([train_lc[-1], val_lc[-1]]), N[0], N[-1],
              color='gray', linestyle='dashed')
    # graph setting up
    ax.set_ylim(0.5, 1.2)
    ax.set_xlim(N[0], N[-1])
    ax.set_xlabel('training size')
    ax.set_ylabel('Accuracy')
    ax.set_title("Random forest Accuracy Train/Valid of our final model")
    ax.legend(loc='best')
    fig.show()
#------------------------valid_score_curve------------------
def valid_score_curve(train_score, val_score, n_estimators = np.arange(1, 50)):
    '''Plot median train/validation scores against the number of trees.

    NOTE(review): redefined later in the file with a different y-axis lower
    bound (0.6 instead of 0.1); the later definition wins at runtime.
    '''
    fig, ax = plt.subplots(figsize=(16, 6))
    # median over the CV folds (axis 1)
    ax.plot(n_estimators, np.median(train_score, 1), color='blue', label='training score')
    ax.plot(n_estimators, np.median(val_score, 1), color='red', label='validation score')
    # matplot setting
    ax.legend(loc='best')
    ax.set_ylim(0.1, 1.2)
    ax.set_xlim(0, 50)
    ax.set_title("Train/Valid ACCURACY loss of different random forest models")
    ax.set_xlabel('number of trees')
    ax.set_ylabel('ACCURACY');
    plt.show()
#---------------clean_useless_information---------------
def show_reports(data_df, parameter = ['reviewText'], output_type = 'num_of_words'):
    '''Compute text statistics for *parameter* and plot them as histograms and
    boxplots (no outlier removal).  Returns the statistics dataframe.

    NOTE(review): output_type is accepted but unused here.
    '''
    frame = data_df.copy()
    reports = text_item_properties(frame.loc[:, parameter])
    # visualise the raw distributions two ways
    ax = mulitple_function_plots(data=reports, kde_type=False, plot_type="histogram",
                                 data_type="number", fig_size=(15, 7), tight_layout=False)
    ax = mulitple_function_plots(data=reports, kde_type=False, plot_type="boxplot",
                                 data_type="number", fig_size=(15, 7), tight_layout=False)
    return reports
################################ FUNCTION ################################
#--------------------------Describe columns----------------------------------------
def describe_columns(data, features_name=[]):
    '''Print the value counts of every feature named in *features_name*.

    The mutable default is kept for interface compatibility (never mutated).
    A single loop replaces the original's two branches — output is identical
    for one or more names, and an empty list no longer raises IndexError.
    '''
    for name in features_name:
        print("----------", data[name].name,"---------")
        print(data[name].value_counts())
#-------------Function from tutorial 2-----------------------------
def build_continuous_features_report(data_df):
    """Return a summary table (count, missing %, cardinality, quartiles, mean,
    median, std) with one row per numeric column of *data_df*."""
    stats = {
        "Count": len,
        "Miss %": lambda df: df.isna().sum() / len(df) * 100,
        "Card.": lambda df: df.nunique(),
        "Min": lambda df: df.min(),
        "1st Qrt.": lambda df: df.quantile(0.25),
        "Mean": lambda df: df.mean(),
        "Median": lambda df: df.median(),
        "3rd Qrt": lambda df: df.quantile(0.75),
        "Max": lambda df: df.max(),
        "Std. Dev.": lambda df: df.std(),
    }
    numeric_df = data_df.select_dtypes("number")
    report_df = pd.DataFrame(index=numeric_df.columns, columns=stats.keys())
    # Empty features can trigger RuntimeWarnings; silence them for the report.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        for label, compute in stats.items():
            report_df[label] = compute(numeric_df)
    return report_df
#-------------Function from tutorial 2---------------------------
def build_categorical_features_report(data_df):
    """Build tabular report for categorical features.

    One row per non-numeric column: count, missing %, cardinality, mode
    (with frequency and %), and second-mode statistics.
    """
    def _mode(df):
        # Most frequent value(s) per feature; kept as a list to allow ties.
        return df.apply(lambda ft: ft.mode().to_list()).T
    def _mode_freq(df):
        # Total occurrences of the modal value(s).
        return df.apply(lambda ft: ft.value_counts()[ft.mode()].sum())
    def _second_mode(df):
        # Mode of the feature after removing rows equal to the first mode.
        return df.apply(lambda ft: ft[~ft.isin(ft.mode())].mode().to_list())
    def _second_mode_freq(df):
        # Frequency of that second mode, computed on the same filtered rows.
        return df.apply(
            lambda ft: ft[~ft.isin(ft.mode())]
            .value_counts()[ft[~ft.isin(ft.mode())].mode()]
            .sum()
        )
    stats = {
        "Count": len,
        "Miss %": lambda df: df.isna().sum() / len(df) * 100,
        "Card.": lambda df: df.nunique(),
        "Mode": _mode,
        "Mode Freq": _mode_freq,
        "Mode %": lambda df: _mode_freq(df) / len(df) * 100,
        "2nd Mode": _second_mode,
        "2nd Mode Freq": _second_mode_freq,
        "2nd Mode %": lambda df: _second_mode_freq(df) / len(df) * 100,
    }
    cat_feat_names = data_df.select_dtypes(exclude="number").columns
    continuous_data_df = data_df[cat_feat_names]
    report_df = pd.DataFrame(index=cat_feat_names, columns=stats.keys())
    for stat_name, fn in stats.items():
        # NOTE: ignore warnings for empty features
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            report_df[stat_name] = fn(continuous_data_df)
    return report_df
#-------------One function to plot multiple kinds of graph ---------------------------
# All codes were written by myself
# set keyword only parameter. Since we have 3 options to plot. plot function and data type must be indentified before ploting
def mulitple_function_plots(tight_layout = True, h_space = 0.4,w_space=0.3, columns = 2, fig_size = (10,15) , kde_type = True,
                            *,data,plot_type="histogram",data_type="number"):
    '''Plot all features from the dataset, you must specified your dataset by, data =

    plot_type is one of "histogram" | "boxplot" | "barplot"; data_type is
    "number" or "categorical".  Returns the matplotlib axes grid.
    '''
    if data_type == "number":
        feat_names = data.select_dtypes("number").columns
    elif data_type == "categorical":
        feat_names = data.select_dtypes(exclude="number").columns
    # seperate those features into `columns` columns
    rows_number = math.ceil(len(feat_names)/columns)
    print("Those features will be plotted in ", rows_number, " rows and ", columns , "columns")
    # print continuous features name
    print(feat_names)
    # initialize figure
    # NOTE(review): axs[i][j] indexing assumes rows_number > 1 and columns > 1;
    # with a single row/column matplotlib returns a 1-D axes array — confirm inputs.
    fig, axs = plt.subplots(rows_number, columns, figsize=fig_size)
    index = 0
    # wall-clock timing of the rendering loop
    start = datetime.datetime.now()
    #print
    for i in range(rows_number):
        for j in range(columns):
            if index < len(feat_names):
                if plot_type == 'histogram': # shortcut for histogram plot
                    sns.histplot(data=data, x=feat_names[index], bins = 30,kde=kde_type, ax=axs[i][j])
                elif plot_type == 'boxplot': # boxplot
                    data.boxplot(column=feat_names[index],ax=axs[i][j], vert=False)
                elif plot_type == 'barplot': # barplot
                    data[feat_names[index]].value_counts().plot.bar(ax=axs[i][j],rot=0);
                # set corresponded name of selected features
                axs[i][j].set_xlabel(feat_names[index])
                # end of calculating the time
                end = datetime.datetime.now()
                # print info (elapsed time since the loop started)
                print(index+1, ". Finish Rendering :", feat_names[index],", used",
                      (end - start).seconds, "millseconds")
                index += 1
            else:
                break
    # adjust spacing between subplots
    plt.subplots_adjust(hspace = h_space,wspace=w_space)
    # add figure title
    fig.suptitle(str(plot_type.title() + " of all " + data_type.title() + " features"), fontweight ="bold")
    # set whether we want to plot a tight_layout figure
    if tight_layout:
        fig.tight_layout()
        fig.subplots_adjust(top=0.95)
    return axs
#------------- draw heatmap -----------------------------------------------------
def heatmap_draw(data):
    '''Draw an annotated correlation heatmap over the columns of *data*.

    NOTE(review): data.corr() on a frame containing non-numeric columns
    raises in pandas >= 2.0 — pass numeric data (or use numeric_only=True).
    '''
    # Correlation between different variables
    corr = data.corr()
    # Set up the matplotlib plot configuration
    f, ax = plt.subplots(figsize=(12, 10))
    # Configure a custom diverging colormap
    cmap = sns.diverging_palette(230, 20, as_cmap=True)
    # Draw the heatmap
    sns.heatmap(corr, annot=True, cmap=cmap)
    plt.title("Heatmap correlation among all features")
import matplotlib.pyplot as plt
import numpy
from sklearn import metrics
############################FUNCTIONS ON PIPELINE##########################################
#--------------purge_NaN-------------------
def purge_NaN(data_df):
    '''Return a copy of *data_df* without the vote/image columns and without
    any row containing a NaN value.'''
    data = data_df.copy()
    # drop the sparse vote and image columns first
    data = data.drop(['vote','image'], axis = 1)
    # The original looped over the columns dropping NaN rows one column at a
    # time — that is exactly dropna(): remove any row with a NaN anywhere.
    return data.dropna()
#---------------clean_useless_information---------------
def purge_outliers(data_df, parameter = 'reviewText', output_type = 'num_of_words'):
    '''Drop the rows of *data_df* whose word count in *parameter* is an
    upper-fence outlier (per find_outliers).'''
    frame = data_df.copy()
    # word count per row of the text column
    counts = pd.DataFrame()
    counts[output_type] = frame[parameter].str.split().str.len()
    # indices of rows above the IQR upper fence
    outlier_index = find_outliers(counts, output_type)
    return frame.drop(outlier_index)
################################ CLASS ################################
#------------- main transformer ---------------------
# Class for attribute transformer
# import important libray
from sklearn.base import BaseEstimator, TransformerMixin
class combined_attribute_adder_and_cleaner(BaseEstimator, TransformerMixin):
    '''data clean transfomer class

    Sklearn-style transformer: drops NaN columns/rows, purges review-length
    outliers, strips markup/punctuation/stop words from text columns, builds
    a combined 'text' column, and moves the target ('overall') to the end
    as 'score'.
    '''
    def __init__(self, data_cleaner = True, servies_remainer = False, normalization = True): # no *args or **kargs
        # we need to set extra var to ensure do we need to purge the dataset.
        # In my following experments, sometimes we don't need to do so.
        # NOTE(review): these flags are stored but never read in transform().
        self.data_cleaner = data_cleaner
        self.servies_remainer = servies_remainer
        self.normalization = normalization
    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self # nothing else to do
    def transform(self, data_df):
        # we first copy the data from our dataset.
        # operate on original data set sometimes is dangerous.
        X = data_df.copy()
        #0. drop NaN values
        # drop vote and image
        X = X.drop(['vote','image'], axis = 1)
        # drop NaN values (any row with a NaN in any remaining column)
        for i in range(len(X.columns)):
            X = X.drop(X[X[str(X.columns[i])].isna()].index)
        # 1. First we change the feature verified with to integer
        X["verified"] = X["verified"].astype(int)
        # 2. purge outliers (purge_outliers defaults: reviewText word count)
        X = purge_outliers(X)
        # 3. drop all useless features and categorical features we alreayd transfered
        X = X.drop(['reviewerID','reviewTime', 'asin', 'unixReviewTime'],axis=1)
        # 4. delete HTML tag and other useless characters
        X = clean_useless_information(X)
        # 5. clean alphanumeric data: drop the 'Format' token from style
        X['style'] = X['style'].str.replace('Format', '')
        # get text feature columns
        feature = X.select_dtypes(exclude="number").columns
        for i in range(len(feature)):
            print("Now it's removing number and alphanumberic from ", feature[i])
            # remove punctuation, then digits
            # NOTE(review): pandas >= 2.0 treats str.replace patterns as
            # literal by default; these rules rely on regex semantics — confirm
            # the pandas version in use.
            X[feature[i]] = X[feature[i]].str.replace('[^\w\s]+', '')
            X[feature[i]] = X[feature[i]].str.replace('[0-9]+', '')
        # remove stop words using NLTK's English list
        stop_words = stopwords.words('english')
        for i in range(len(feature)):
            print("Now it's removing stop words from ", feature[i])
            # lower-case first so stop-word matching is case-insensitive
            X[feature[i]] = X[feature[i]].str.lower()
            X[feature[i]] = X[feature[i]].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
        # create new column combining summary and review body
        X['text'] = X['summary'] + " " + X['reviewText']
        #6. clean style's space
        X['style'] = X['style'].str.replace(' ', '')
        # we put our target value at the end
        target = X.pop('overall')
        X['score'] = target
        return X
#############################PIPE LINE###################################################
# Now we build a transformer to get all the above steps
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# convert_pipeline is for create a whole pipeline but remain the dataFrame structure
# (single-step pipeline: the combined cleaner/attribute transformer above)
convert_pipeline = Pipeline([
    ('attribs_adder_cleaner', combined_attribute_adder_and_cleaner(data_cleaner=True)),
])
# ensure the random seed, that our result won't be really random
def same_seed(seed):
    '''Fixes random number generator seeds for reproducibility.'''
    # Deterministic cuDNN kernels (slower, but reproducible on GPU).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Seed NumPy and the CPU-side torch generator.
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device as well, if any are present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
##############
# split dataset
def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into training set and validation set'''
    # Size the validation split by ratio; training gets the remainder.
    n_valid = int(valid_ratio * len(data_set))
    n_train = len(data_set) - n_valid
    # Seeded generator makes the random partition reproducible.
    generator = torch.Generator().manual_seed(seed)
    train_subset, valid_subset = random_split(data_set, [n_train, n_valid], generator=generator)
    # Materialise the torch Subsets back into plain NumPy arrays.
    return np.array(train_subset), np.array(valid_subset)
#### function of make the neural network prediction
def predict(test_loader, model, device):
    '''Run *model* over *test_loader* and return all predictions as one NumPy array.'''
    model.eval()  # evaluation mode: no dropout / batch-norm updates
    batch_outputs = []
    for x in tqdm(test_loader):  # tqdm shows batch progress
        x = x.to(device)
        # Inference only — gradients are not needed.
        with torch.no_grad():
            out = model(x)
            batch_outputs.append(out.detach().cpu())  # keep results on CPU
    # Concatenate the per-batch predictions into a single array.
    return torch.cat(batch_outputs, dim=0).numpy()
def plot_learning_curve(loss_record, title='', type = 'acc' , y_start = 0., y_end = 1, ylabel='Accuracy', figsize = (17,10), x_start = 0, x_end = 2000):
    ''' Plot learning curve of your DNN (train & dev loss)

    loss_record must contain 'train_acc'/'valid_acc' (and, when type != 'acc',
    'train_loss'/'valid_loss') lists.  NOTE(review): the parameter `type`
    shadows the builtin, and the legend labels always read "loss" even in
    accuracy mode.
    '''
    accuracy_label = ['train_acc', 'valid_acc']
    loss_label = ['train_loss', 'valid_loss']
    # choose which pair of series to plot
    if type == 'acc':
        plot_selection = accuracy_label
    else:
        plot_selection = loss_label
    # extend the x axis to cover every recorded step
    x_end = len(loss_record[plot_selection[0]])
    total_steps = len(loss_record[plot_selection[0]]) # get the length of our records
    x_1 = range(total_steps) # get the range of x
    # validation is recorded less often: subsample the x positions to match.
    # NOTE(review): the stride is always computed against 'valid_acc', even
    # when plotting losses — confirm this is intended.
    x_2 = x_1[::len(loss_record[plot_selection[0]]) // len(loss_record['valid_acc'])]
    figure(figsize=figsize) # set figsize
    plt.plot(x_1, loss_record[plot_selection[0]], c='tab:red', label='train loss')
    plt.plot(x_2, loss_record[plot_selection[1]], c='tab:cyan', label='validation loss')
    plt.ylim(y_start, y_end) # set limit on y axis
    plt.xlim(x_start, x_end) # set limit on x axis
    plt.xlabel('Training steps')
    plt.ylabel(ylabel)
    plt.title('Learning curve of {}'.format(title))
    plt.legend()
    plt.show()
def plot_pred(dv_set, model, device, lim=360., preds=None, targets=None,figsize=(15,15)):
    ''' Plot prediction of your DNN

    Scatter of ground-truth vs predicted values.  When preds/targets are not
    supplied, they are computed by running *model* over *dv_set*.
    '''
    if preds is None or targets is None:
        model.eval()  # evaluation mode for inference
        preds, targets = [], [] # do prediction
        for x, y in dv_set:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():  # no gradients during inference
                pred = model(x)
                preds.append(pred.detach().cpu())
                targets.append(y.detach().cpu())
        preds = torch.cat(preds, dim=0).numpy() # save the prediction
        targets = torch.cat(targets, dim=0).numpy() # save the target value
    # matplot setting
    figure(figsize = figsize)
    plt.scatter(targets, preds, c='r', alpha=0.5)
    # identity line: perfect predictions fall on it
    plt.plot([-0.2, lim], [-0.2, lim], c='b')
    plt.xlim(-0.2, lim)
    plt.ylim(-0.2, lim)
    plt.xlabel('ground truth value')
    plt.ylabel('predicted value')
    plt.title('Ground Truth v.s. Prediction')
    plt.show()
# Dataset class
class Dataset_container(Dataset):
    '''
    Thin torch Dataset over a feature array and optional targets.
    x: Features.
    y: Targets, if none, do prediction.
    '''
    def __init__(self, x, y=None):
        # Features become float tensors; targets (if given) become longs.
        self.x = torch.FloatTensor(x)
        self.y = None if y is None else torch.LongTensor(y)
    def __getitem__(self, idx):
        # Prediction mode yields features only; training mode yields pairs.
        if self.y is None:
            return self.x[idx]
        return self.x[idx], self.y[idx]
    def __len__(self):
        # Number of samples.
        return len(self.x)
def select_feat(train_data, valid_data, test_data, select_all=True):
    '''Selects useful features to perform regression'''
    # Convention: the last column of each array is the target.
    y_train, y_valid, y_test = (arr[:, -1] for arr in (train_data, valid_data, test_data))
    # Everything before the last column is a candidate feature.
    raw_x_train, raw_x_valid, raw_x_test = (arr[:, :-1] for arr in (train_data, valid_data, test_data))
    # Either keep every feature column or a hand-picked index subset.
    feat_idx = list(range(raw_x_train.shape[1])) if select_all else [0, 1, 2, 3, 4]
    return (raw_x_train[:, feat_idx], raw_x_valid[:, feat_idx],
            raw_x_test[:, feat_idx], y_train, y_valid, y_test)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Training configuration / hyper-parameters used by create_data_loader and
# the training loop.
config = {
    'seed': 5201314, # Your seed number, you can pick your lucky number. :)
    'select_all': True, # Whether to use all features.
    'valid_ratio': 0.42857143, # validation_size = train_size * valid_ratio
    'n_epochs': 2000, # Number of epochs. Try 2000 at first
    'batch_size': 512, # since we have a quite large dataset. The batch size should be large too
    'learning_rate': 1e-3,
    'early_stop': 50, # If model has not improved for this many consecutive epochs, stop training.
    'save_path': './models/model.ckpt' # Your model will be saved here.
}
# prepara for training data oversampling
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
def create_data_loader(model_train, model_test, oversampling = True):
    '''Split model_train into train/valid sets (optionally oversampling only
    the training portion), wrap all three splits in Dataset_container, and
    return (train_loader, valid_loader, test_loader).

    Relies on the module-level `config` dict for seed, valid_ratio, feature
    selection, and batch size.
    '''
    # Set seed for reproducibility
    same_seed(config['seed'])
    test_data = model_test.values
    # get training data
    train_data = model_train
    ###################################################################################
    ##################Oversampling on training data only###############################
    if oversampling == True:
        # split to trianing set and testing set
        train_data, valid_data = train_test_split(train_data, test_size = config['valid_ratio'], random_state = config['seed'])
        print(f'original train_data with out oversampling size: {train_data.shape}')
        ros = RandomOverSampler(random_state=0) # RandomOverSampler
        X_resampled, y_resampled = ros.fit_resample(train_data.iloc[:,:-1], train_data.iloc[:,-1:]) # get the features and labels
        # get oversampled train data
        train_data = pd.concat([X_resampled,y_resampled], axis = 1) # concate features and labels
    else:
        train_data, valid_data = train_test_split(train_data, test_size = config['valid_ratio'], random_state = config['seed'])
    ##################End of Oversampling on training data#############################
    ###################################################################################
    # Convert every split to plain float arrays for select_feat / torch.
    train_data = train_data.astype(float).values
    valid_data = valid_data.astype(float).values
    test_data = test_data.astype(float)
    # fool proof for unsuccessful data preparation stage.
    # Print out the data size.
    print(f"""train_data size: {train_data.shape}
valid_data size: {valid_data.shape}
test_data size: {test_data.shape}""")
    # Select features
    x_train, x_valid, x_test, y_train, y_valid, y_test = select_feat(train_data, valid_data, test_data, config['select_all'])
    # Print out the number of features.
    print(f'number of features: {x_train.shape[1]}')
    train_dataset, valid_dataset, test_dataset = Dataset_container(x_train, y_train), \
                                                 Dataset_container(x_valid, y_valid), \
                                                 Dataset_container(x_test,y_test)
    # Pytorch data loader loads pytorch dataset into batches.
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
    valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)
    return train_loader, valid_loader, test_loader
#------------------------learning_curve------------------
def learning_curve(N, train_lc, val_lc):
    """Plot mean training vs. validation accuracy against training-set size.

    N        : training-set sizes (x axis)
    train_lc : per-size, per-fold training scores (2-D)
    val_lc   : per-size, per-fold validation scores (2-D)

    NOTE(review): this function shadows sklearn.model_selection.learning_curve,
    which is imported at the top of the file -- confirm callers expect this
    plotting helper rather than the sklearn function.
    """
    fig, axes = plt.subplots(figsize=(16, 6))
    # average over the CV folds (axis 1) for each training size
    axes.plot(N, np.mean(train_lc, 1), color='blue', label='training score')
    axes.plot(N, np.mean(val_lc, 1), color='red', label='validation score')
    # dashed reference line at the mean of the final train/valid scores
    axes.hlines(np.mean([train_lc[-1], val_lc[-1]]), N[0], N[-1],
                color='gray', linestyle='dashed')
    axes.set_xlim(N[0], N[-1])
    axes.set_ylim(0.5, 1.2)
    axes.set_xlabel('training size')
    axes.set_ylabel('Accuracy')
    axes.set_title("Random forest Accuracy Train/Valid of our final model")
    axes.legend(loc='best')
    fig.show()
#------------------------valid_score_curve------------------
def valid_score_curve(train_score, val_score, n_estimators = None):
    """Plot median train/validation accuracy vs. number of trees.

    Parameters
    ----------
    train_score, val_score : 2-D arrays of per-fold scores, one row per
        candidate n_estimators value.
    n_estimators : 1-D array of tree counts for the x axis; defaults to
        np.arange(1, 50) (same as before, but no longer a mutable
        default argument evaluated once at definition time).
    """
    if n_estimators is None:
        n_estimators = np.arange(1, 50)
    fig, ax = plt.subplots(figsize=(16, 6))
    # median across the CV folds (axis 1) for each candidate model
    ax.plot(n_estimators, np.median(train_score, 1), color='blue', label='training score')
    ax.plot(n_estimators, np.median(val_score, 1), color='red', label='validation score')
    # matplot setting
    ax.legend(loc='best')
    ax.set_ylim(0.6, 1.2)
    ax.set_xlim(0, 50)
    ax.set_title("Train/Valid ACCURACY loss of different random forest models")
    ax.set_xlabel('number of trees')
    ax.set_ylabel('ACCURACY')
    plt.show()
import matplotlib.pyplot as plt
import numpy
from sklearn import metrics
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix
#------------------------draw_confusion_matrix------------------
def draw_confusion_matrix_testing(y_test, pred):
    """Plot the confusion matrix of true churn labels vs. predictions.

    y_test : ground-truth labels
    pred   : predicted labels
    """
    # Use a local name that does not shadow the imported
    # sklearn.metrics.confusion_matrix function.
    cm = metrics.confusion_matrix(y_test, pred)
    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [True, False])
    cm_display.plot()
    # axis labels (typo "Prediciton" fixed)
    cm_display.ax_.set_xlabel("Prediction")
    cm_display.ax_.set_ylabel("Churn")
    cm_display.ax_.set_title("Confusion matrix of Churn vs prediction")
    plt.show()
#------------------------draw_normalized_confusion_matrix------------------
def draw_normalized_confusion_matrix_testing(y_true, y_pred, fig_size = (10,7)):
    """Draw a row-normalized confusion matrix as a seaborn heatmap."""
    fig, axs = plt.subplots(figsize=fig_size)
    # L1-normalize each row so every true class sums to 1
    raw_counts = confusion_matrix(y_true, y_pred)
    normed_counts = normalize(raw_counts, axis=1, norm='l1')
    # annot=True annotates the cells; fmt='g' disables scientific notation
    sns.heatmap(normed_counts, annot=True, fmt='g', ax = axs)
    # labels, title and ticks
    axs.set_xlabel("Prediction")
    axs.set_ylabel('Churn')
    axs.set_title('Confusion Matrix of Churn vs Prediction')
    axs.xaxis.set_ticklabels(['True', 'False'])
    axs.yaxis.set_ticklabels(['True', 'False'])
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)
#-------------------test_scores_on_two_models-------------------
def test_scores_on_two_models(test_loader, pred_1, pred_2, function, type = 'Accuracy'):
    """Score two models over a list of test loaders, draw a comparison
    boxplot, and run a KS two-sample test on the score distributions.

    Parameters
    ----------
    test_loader : sequence of DataLoaders whose datasets expose a `.y` tensor.
    pred_1, pred_2 : per-loader predictions for the random forest and the
        neural network respectively.
    function : metric callable, e.g. accuracy_score or f1_score.
    type : metric name used in the plot labels. (The parameter name shadows
        builtins.type; kept as-is for backward compatibility with callers.)

    Returns
    -------
    (RF_score, NN_score) : lists of the per-loader metric values.
    """
    RF_score = []
    NN_score = []
    # Generalized from a hard-coded range(10): score every provided prediction set.
    for i in range(len(pred_1)):
        RF_score.append(function(test_loader[i].dataset.y.numpy(), pred_1[i]))
        NN_score.append(function(test_loader[i].dataset.y.numpy(), pred_2[i]))
    results = pd.DataFrame()
    results['randomForest'] = RF_score
    results['NeuralNetwork'] = NN_score
    figure(figsize=(8, 6), dpi=100)
    ax = sns.boxplot(data=results)
    plt.ylabel(str(type))
    # fixed missing space before "between" in the title
    plt.title('Boxplot of ' + str(type) + ' between Randomforest and DNN')
    ks_2samp_test(results, 'randomForest', 'NeuralNetwork')
    return RF_score, NN_score
from scipy.stats import ks_2samp
#-------------------ks_2samp_test-------------------
def ks_2samp_test(data, param1='randomForest', param2='NeuralNetwork'):
    """Run a Kolmogorov-Smirnov two-sample test on two score columns of
    `data` and print whether they appear to share a distribution
    (alpha = 0.05)."""
    max_print_out(True)
    statistic, pvalue = ks_2samp(data[param1].values, data[param2].values)
    print("##################### p-value = ", pvalue, "####################")
    if pvalue > 0.05:
        print('Samples are likely drawn from the same distributions (fail to reject H0)')
    else:
        print('##################### Samples are likely drawn from different distributions (reject H0)####################')
Start up
Build the data quality report.
Identify data quality issues and build the data quality plan.
Preprocess your data according to the data quality plan.
Answer the following questions:
Since the original dataset is too large, there are no FREE resources we can use to load the whole original dataset into memory.
Hence, we use pandas with a chunk size of 2.5 million to get the third chunk of the original dataset, then randomly sample it down to a 1 million subset.
Why 2.5 million?
Well, my device and Kaggle notebook can not afford anymore instances in their memory.
Write a little code to get our 2.5 million instances from the middle of our original dataset.
# create a container for our subsets
subset =[]
# Stream the gzipped JSON-lines file in chunks of 2,500,000 rows;
# we keep only the 3rd chunk and later sample ~1M rows from it.
with pd.read_json('./Books_5.json.gz', lines=True, chunksize=2500000) as reader:
index = 0
for subset in reader: # get subsets by reader
print("Now, it's loading chunk ", index)
# skip the first two chunks (the first 5 million rows)
index += 1 # set index for skipping the first a few millions instances
if index == 3: # we take the third 2.5 million instances as our subset
break
# NOTE(review): `del subset` here would discard the chunk that the next
# cell's subset.head() relies on -- confirm this cleanup belongs after
# the 1M sample has been taken, not at this point.
del subset # clean memory manually
Print the subset's head
subset.head()
Convert raw dataset dtypes and print the info
subset = subset.convert_dtypes()
subset.info()
We can see that style is an object.
Before we do anything to it. We create a copy feature of it and convert it to string type
subset['style_str'] = subset['style'].astype('str')
# show value counts
subset.style_str.value_counts()
It's indeed a string type.
subset.describe()
Now, let's use a random split to get 1 million instances from our 2.5 million dataset.
from sklearn.model_selection import train_test_split
final_subset, _ = train_test_split(subset, test_size=0.6, random_state=42)
Print subset's info
final_subset.info()
Now, we have our self-generated data.
Let's make our dataset as same as our kaggle subset's structure.
raw_data = final_subset
raw_data = raw_data.convert_dtypes()
raw_data = raw_data.drop('Unnamed: 0', axis=1) # old index is useless now. drop it
Print our new generated data columns for more details.
raw_data.columns
raw_data.info()
We can see that vote only has 217191 instances; there are too many NaN values, so we don't convert its type for now, since we will drop this column later.
# we don't need this self generated column anymore
raw_data = raw_data.drop('style_str',axis = 1)
raw_data.info()
# save the final version of data
raw_data.to_csv('self_generated_dataset.csv')
I compared my 1 million dataset with the kaggle dataset personally. The result is fine. I won't attach those here, since we can see from the describe() function that they have similar distributions of overall scores.
raw_data = pd.read_csv('/content/drive/MyDrive/A3/self_generated_dataset.csv')
raw_data = raw_data.convert_dtypes()
raw_data = raw_data.drop('Unnamed: 0', axis=1) # old index is useless now. drop it
raw_data.head()
Print our self-generated data's info
raw_data.info()
We can see there are mulitple features have missing value. Vote and image has too many NaN values, we have to drop those 2 columns. The rest are style, reviewerName, reviewText, summary. There aren't too many of NaN values, we can handle it one by one.
We will handle those later.
# We set our print out line limis to maximum and set string out print format [.2f ]
max_print_out(True)
# describe continous features summary
raw_data.describe()
Reuse the function from Tutorial
#-------------Function from tutorial 2-----------------------------
def build_continuous_features_report(data_df):
    """Build tabular report for continuous features.

    Returns a DataFrame with one row per numeric column of `data_df`
    and one column per summary statistic (count, missing %, cardinality,
    quartiles, mean, median, max, std).
    """
    numeric_cols = data_df.select_dtypes("number").columns
    numeric_df = data_df[numeric_cols]
    stat_fns = {
        "Count": len,
        "Miss %": lambda df: df.isna().sum() / len(df) * 100,
        "Card.": lambda df: df.nunique(),
        "Min": lambda df: df.min(),
        "1st Qrt.": lambda df: df.quantile(0.25),
        "Mean": lambda df: df.mean(),
        "Median": lambda df: df.median(),
        "3rd Qrt": lambda df: df.quantile(0.75),
        "Max": lambda df: df.max(),
        "Std. Dev.": lambda df: df.std(),
    }
    report_df = pd.DataFrame(index=numeric_cols, columns=stat_fns.keys())
    for stat_name, fn in stat_fns.items():
        # empty numeric features raise RuntimeWarnings; ignore them
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            report_df[stat_name] = fn(numeric_df)
    return report_df
max_print_out(True)
# Call function from
build_continuous_features_report(raw_data)
We can see that the cardinality of overall = 5, so that is a categorical feature. Verified is also a categorical feature. unixReviewTime is actually a time; we can convert it to a time series to see what it means.
We draw the histogram of those 3 features anyway.
ax = mulitple_function_plots(data=raw_data.loc[:,['overall','verified' , 'unixReviewTime']], kde_type= False , plot_type="histogram",data_type="number", fig_size=(15,7))
We can see there are a lot of scores are 5. That is not a good thing.
We may need to use stratified sampling later.
ax = mulitple_function_plots(data=raw_data.loc[:,['overall','verified' , 'unixReviewTime']], kde_type= False , plot_type="boxplot",data_type="number", fig_size=(15,7))
Outliers in unixReviewTime could just be very old reviews. It doesn't matter too much.
There are a few outliers in overall. We need to investigate it further.
we will substitute the actual text items with their properties such as:
Any additional properties that you find useful in understanding text.
Here we use a property called:
Stop words count. We want to calculate how many stop words it has in every instance.
raw_data.head()
A function that get those 4 information from the original dataset.
First we get the data of nltk to get our stop words.
import nltk
# download the basic list of data and models
nltk.download('popular')
# download "book" collection of datasets from NLTK website
nltk.download("book")
from nltk.book import *
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
A function to get the reports.
This function will show:
#--------------text_item_properties---------------#
'''We want to save all the results to a new dataframe'''
def text_item_properties(data, stop_word_set=None):
    """Compute four text properties for every column of `data`.

    For each input column C the result DataFrame gets columns:
      C + 'Text_length'               -- character count
      C + 'num_of_words'              -- whitespace-separated word count
      C + 'presence_non_alphanumeric' -- count of chars outside [a-zA-Z0-9 ]
      C + 'stop_words_count'          -- number of distinct stop words present

    Parameters
    ----------
    data : DataFrame of text columns. NaNs are filled with the string '0'
        first so the .str accessors do not fail on missing values.
    stop_word_set : optional set of stop words; defaults to the
        module-level NLTK `stop_words` set (backward compatible).
    """
    if stop_word_set is None:
        stop_word_set = stop_words
    result = pd.DataFrame()
    data = data.copy()
    # fill NaNs so the .str accessors below do not raise
    data = data.fillna('0')
    for raw_name in data.columns:
        col = str(raw_name)
        # character length
        result[col + 'Text_length'] = data[col].str.len()
        # number of words
        result[col + 'num_of_words'] = data[col].str.split().str.len()
        # count of non-alphanumeric characters. regex=True is explicit:
        # pandas >= 2.0 defaults str.replace to literal matching, which
        # would silently break this character class.
        result[col + 'presence_non_alphanumeric'] = data[col].str.replace('[a-zA-Z0-9 ]', '', regex=True).str.len()
        # number of distinct stop words in the text
        result[col + 'stop_words_count'] = data[col].str.split().apply(lambda words: len(set(words) & stop_word_set))
    return result
We will follow the importance of those text items columns.
First we take a look at review ID.
Use our function we defined before on single column, ReviewID
reports_reviewerID = text_item_properties( raw_data.loc[:, ['reviewerID']]) ;
Save the results into pkls
# dump it into pkls
joblib.dump(reports_reviewerID, 'reports_reviewerID.pkl')
Our reports head:
reports_reviewerID.head()
Now, let's print the continous report for those 4 properties the ReviewID feature has.
build_continuous_features_report(reports_reviewerID)
I already generate the plot function in As1 and As2.
Hence, we don't write the function anymore, I put it in the Utility Function block.
Now,we just use it.
ax = mulitple_function_plots(data=reports_reviewerID,kde_type = False, plot_type="histogram",data_type="number", fig_size=(10,7))
ax = mulitple_function_plots(data=reports_reviewerID,plot_type="boxplot",data_type="number", fig_size=(15,7))
We can see we actually have outliers in Text length.
The reviewID is varied. That could be a problem. But if we don't need this feature, then it's fine.
Now, we take a look at summary.
reports_summary = text_item_properties(raw_data.loc[:, ['summary']])
# dump it into pkls
joblib.dump(reports_summary, 'reports_summary.pkl')
build_continuous_features_report(reports_summary)
Standard deviation of summary text is a little bit high. It's more than 19.26. People may write a very varied length of summarys.
ax = mulitple_function_plots(data=reports_summary,kde_type = False, plot_type="histogram",data_type="number", fig_size=(10,7))
They are Poisson distribution of all features.
The distribution is fair enough.
ax = mulitple_function_plots(data=reports_summary,plot_type="boxplot",data_type="number", fig_size=(15,7))
There are a lot of outliers in those 4 features.
People do write their summary in varied.
But we would say that most of people's summary only has a few words.
In summary, most of them are
Review Text will be our main feature. It has a lot of words in it, and we will based on this feature to predict our overall scores.
Let's take a look of its report.
reports_reviewText = text_item_properties( raw_data.loc[:, ['reviewText']]);
# dump it into pkls
joblib.dump(reports_reviewText, 'reports_reviewText.pkl')
build_continuous_features_report(reports_reviewText)
Standard deviation is very high in reviewText, we need to investigate this further more.
ax = mulitple_function_plots(data=reports_reviewText,kde_type= False, plot_type="histogram",data_type="number", fig_size=(10,7))
We have a lots of text length near to 0 and number of words near to 0. This is not a good sign. We will investigate those further.
ax = mulitple_function_plots(data=reports_reviewText, kde_type= False , plot_type="boxplot",data_type="number", fig_size=(15,7))
There is an instance that has 5800 number of words. That is a huge number of words.
And there are huge scale of outliers in the first three features.
We must investigate those in the next chapter.
reports_style = text_item_properties( raw_data.loc[:, ['style']])
# dump it into pkls
joblib.dump(reports_style, 'reports_style.pkl')
build_continuous_features_report(reports_style)
Our report shows style feature has relatively similar properties among all 4 perporties we list here.
ax = mulitple_function_plots(data=reports_style,kde_type = False, plot_type="histogram",data_type="number", fig_size=(10,7))
We can see very high non - alphanumeric characters. We need to delete those. And the num of words are varied.
And we have no stop words in this feature.
ax = mulitple_function_plots(data=reports_summary,plot_type="boxplot",data_type="number", fig_size=(15,7))
We can see that there are a lot of non alphanumeric outliers here.
And a few outliers of number of words.
We will investigate it in the next chapter.
reports_reviewerName = text_item_properties( raw_data.loc[:, ['reviewerName']])
# dump it into pkls
joblib.dump(reports_reviewerName, 'reports_reviewerName.pkl')
build_continuous_features_report(reports_reviewerName)
Similar type of features as same as style.
There is no much variation here.
ax = mulitple_function_plots(data=reports_reviewerID,kde_type = False, plot_type="histogram",data_type="number", fig_size=(10,7))
No non alphanumerica characters.
No stop words.
All instances has one word.
We may drop this feature. Since not too much helping for distinguishing our scores.
ax = mulitple_function_plots(data=reports_reviewerID,plot_type="boxplot",data_type="number", fig_size=(15,7))
Reviewer name is one of the not very important feature.
It doesn't mean too many things. We will drop it in the future. So no further investigation here.
reports_asin = text_item_properties(raw_data.loc[:, ['asin']])
# dump it into pkls
joblib.dump(reports_asin, 'reports_asin.pkl')
build_continuous_features_report(reports_asin)
No Std. Dev at all. Since asin are format number for representing books.
ax = mulitple_function_plots(data=reports_reviewerID,kde_type = False, plot_type="histogram",data_type="number", fig_size=(10,7))
Same as reviewer Name.
ax = mulitple_function_plots(data=reports_reviewerID,plot_type="boxplot",data_type="number", fig_size=(15,7))
ASIN is actually a number for identifying the book names.
Well, it's good to get the book name. But most of API to do so cost many.
And way beyond the purpose of our course.
Hence, we don't take further investigation on it.
First we take another look at the most important feature, review text and summary.
ax = mulitple_function_plots(data=reports_reviewText,kde_type= False, plot_type="histogram",data_type="number", fig_size=(10,7))
ax = mulitple_function_plots(data=reports_reviewText, kde_type= False , plot_type="boxplot",data_type="number", fig_size=(15,7))
We can see there are a lot of outliers in all our 4 text items' properties.
That is not good.
We need to investigate further about this outliers.
First we take a look at the instances with the most number of words in review Text.
reports_reviewText[reports_reviewText['reviewText_num_of_words'] > 5800]
We get the item by use .index
raw_data.loc[reports_reviewText[reports_reviewText['reviewText_num_of_words'] > 5800].index].T
We save it into a new data structure and see what's inside.
outlier_example = raw_data.loc[reports_reviewText[reports_reviewText['reviewText_num_of_words'] > 5800].head(1).index]
max_print_out(True)
outlier_example.reviewText.values
Well, by searching this instance's ASIN. We got this is a book called
Case Closed: Lee Harvey Oswald and the Assassination of JFK
It seems like that this review Text is a part from the book, instead of a real review from customers. We don't want that.
Hence, we can set a threshold to get the instances with reasonbale number of words. In case we get something like this, which is not a review at all.
Now, we reuse our find outliers function in Assignment 1 to find out how many outliers are there.
find_outliers(reports_reviewText, 'reviewText_num_of_words')
Let's take a look at what the review Text like with 300 words.
reports_reviewText[reports_reviewText['reviewText_num_of_words'] > 300].head(1)
max_print_out(True)
outlier_example = raw_data.loc[reports_reviewText[reports_reviewText['reviewText_num_of_words'] > 300].head(1).index]
outlier_example.reviewText.values
This is a valid book review. Just very long.
Let's take a look at more word's review.
max_print_out(True)
outlier_example = raw_data.loc[reports_reviewText[reports_reviewText['reviewText_num_of_words'] > 500].head(1).index]
outlier_example.reviewText.values
When words become more and more, the review text just become more and more make no sence.
We better purge all outliers by the number of words.
Before we move further, we need to know if these kinds of reviews also exist in our kaggle dataset.
kaggle_data = pd.read_json('/content/drive/MyDrive/A3/sample.jsonl', lines=True)
kaggle_data = kaggle_data.convert_dtypes()
kaggle_data.info()
kaggle_reports_reviewText = text_item_properties( kaggle_data.loc[:, ['reviewText']]);
joblib.dump(kaggle_reports_reviewText,'kaggle_reports_reviewText.pkl')
ax = mulitple_function_plots(data=kaggle_reports_reviewText, kde_type= False , plot_type="boxplot",data_type="number", fig_size=(15,7))
Well, the kaggle data also has the kind of instances with thousands of words.
People are writing reading thoughts on Amazon reviews channel. That is quite interesting.
We will figure out how to handle these later.
We don't need the kaggle dataset anymore. Delete for free the memory.
del kaggle_data
There is no way around of these long reviews. For me personally, I would not read those long reviews or reading thoughts when I want to buy a book.
Hence, we better purge all of those outliers and see what's left.
Rewrite our find_outliers function to return the purge index. Since we are operating two different datasets with the same indexing system at a time.
#-------------find_outliers-----------------
def find_outliers(data_df, parameter,* , drop=False, set_threshold=False, threshold_value = 350): # deal with outliers
'''detect and delete outliers '''
# same with previous find_outliers function
Q1 = data_df[parameter].quantile(0.25)
Q3 = data_df[parameter].quantile(0.75)
IQR = Q3-Q1
print(f"IQR = {Q3} - {Q1} = {IQR}")
print(f"MAX = {(Q3 + 1.5 * IQR)}")
if Q1 > 1.5*IQR :
print("Min: ", (Q1 - 1.5 * IQR))
else:
print("Min is 0")
cut_out_value = (Q3 + 1.5 * IQR) # normal outliers deleted
# override the value if we set threshold
if set_threshold == True:
cut_out_value = threshold_value
# get min outliers' index
# get max outliers' index
min_outliers_df = data_df[(data_df[parameter] < (Q1 - 1.5 * IQR))]
max_outliers_df = data_df[(data_df[parameter] > cut_out_value)]
# get negtive outliers' index
negative_outliers_df = data_df[(data_df[parameter] <= 0)]
print("Num of min outliers: ", len(min_outliers_df))
print("Num of max outliers: ", len(max_outliers_df))
print("Num of negative outliers: ", len(negative_outliers_df))
print("Num of the original data set's whole instance", len(data_df))
print("Rate of purged data/total data", len(max_outliers_df)/ len(data_df))
# It's pretty hard to drop multiple indexes at the same time
# Because after one drop action, their index are changed from then
# We need to alter the order of aboving codes.
# And it's pretty unnecessary for us to do this in our assignemnt
# Since we don't have min outliers in this dataset
# And negative values are not outliers
# I decided to purge negative values in transformer instead of here
return max_outliers_df.index
purging_index = find_outliers(reports_reviewText, 'reviewText_num_of_words')
Max words is 251.0. And there are 108609 outliers.
Take nearly 10% of whole 1 million set.
Let's take a look at the boxplot of when we purged the outliers.
ax = mulitple_function_plots(data=reports_reviewText.drop(purging_index), kde_type= False , plot_type="boxplot",data_type="number", fig_size=(15,7))
The number of words' feature has been taken care of. The review Text length still has very large number of outliers.
We take a look at its outliers on the purged set we just got.
report_data_purged = reports_reviewText.drop(purging_index)
# get the first maximum text length instance with our purged dataset.
report_data_purged[report_data_purged['reviewText_Text_length'] > 2700].head(1)
# print the instance from the raw_data and check if's index are the same.
max_print_out(True)
outlier_example = raw_data.loc[report_data_purged[report_data_purged['reviewText_Text_length'] > 2700].head(1).index]
outlier_example
# get the value of this instance
demo_text = outlier_example.reviewText.values
demo_text
That is a surprise.
There are not too many words here, but there are a lot of HTML codes in this review. We don't want that.
Let's see what we can do.
import re as re
text = re.sub('<[^<]+?>', '', str(demo_text))
text = re.sub(' ', '', str(text))
text
This is much better.
Hence, for all text items. We need to purge the HTML tags if there are any.
raw_data_copy = raw_data.copy()
raw_data_copy['reviewText'] = raw_data_copy['reviewText'].str.replace('<[^<]+?>', '')
raw_data_copy['reviewText'] = raw_data_copy['reviewText'].str.replace(' ', '')
Let's run the report function again.
reports_reviewText_purged = text_item_properties( raw_data_copy.loc[:, ['reviewText']]);
purging_index_2 = find_outliers(reports_reviewText_purged, 'reviewText_num_of_words')
ax = mulitple_function_plots(data=reports_reviewText_purged.drop(purging_index_2), kde_type= False , plot_type="boxplot",data_type="number", fig_size=(15,7))
There are still a lot of very long Text length instances.
We need to investigate it further.
report_data_purged_2 = reports_reviewText_purged.drop(purging_index_2)
report_data_purged_2[report_data_purged_2['reviewText_Text_length'] > 1750].head(1)
max_print_out(True)
outlier_example = raw_data_copy.loc[report_data_purged_2[report_data_purged_2['reviewText_Text_length'] > 1750].head(1).index]
outlier_example.reviewText.values
Now, we have website links to delete.
demo_text = str(outlier_example.reviewText.values)
import re as re
text = re.sub('http\S+', '', str(demo_text))
text
Good enough.
Now, it's time to generate a function to do so.
#---------------clean_useless_information---------------
def clean_useless_information(data_df, columns = ['reviewText']):
    """Strip HTML tags, stray space entities, URLs and line breaks from
    the given text columns; returns a cleaned copy of the DataFrame.

    The regex= flag is now passed explicitly on every replace: pandas
    >= 2.0 changed Series.str.replace to literal matching by default,
    which would silently stop the regex patterns from matching.
    """
    data = data_df.copy()
    for col in columns:
        # remove HTML tags such as <br /> or <b>...</b>
        data[col] = data[col].str.replace('<[^<]+?>', '', regex=True)
        # NOTE(review): this literal looks like a leftover of '&nbsp;' /
        # a non-breaking space from the original notebook -- confirm it
        # is not a plain ASCII space, which deletes ALL spaces.
        data[col] = data[col].str.replace(' ', '', regex=False)
        # remove http(s) URLs
        data[col] = data[col].str.replace(r'http\S+', '', regex=True)
        # remove line breaks
        data[col] = data[col].str.replace('\n', '', regex=False)
    return data
raw_data_clean = clean_useless_information(raw_data)
Let's find out whether we did a right thing.
Original problematic instance
# original dataset with problematic instance
# HTML TAG instance
raw_data.iloc[156379,7]
Cleaned instance
raw_data_clean.iloc[156379,7]
NO HTML TAG and nbsp. GOOD.
Original problematic instance
# original dataset with problematic instance
# Long URL
raw_data.iloc[119954,7]
Cleaned instance
raw_data_clean.iloc[119954,7]
NO URL anymore. Good!
Note, such a bad idea to add feature name into result column name.
I don't have time to rerun codes from 1.a.2 Text item Properties.
So we changed our function from now on.
#--------------text_item_properties---------------#
'''We want to save all the results to a new dataframe'''
def text_item_properties(data, stop_word_set=None):
    '''Build a report of text properties for the given column(s).

    Result columns (no feature-name prefix, unlike the earlier variant):
    Text_length, num_of_words, presence_non_alphanumeric (characters
    outside [a-zA-Z0-9] -- note spaces count here) and stop_words_count.

    NOTE(review): with more than one input column, each iteration
    overwrites the previous column's results, so only the LAST column's
    properties survive -- intended for single-column use.

    stop_word_set defaults to the module-level NLTK `stop_words` set.
    '''
    if stop_word_set is None:
        stop_word_set = stop_words
    result = pd.DataFrame()
    data = data.copy()
    # fill NaNs so the .str accessors below do not raise
    data = data.fillna('0')
    for raw_name in data.columns:
        col = str(raw_name)
        # character length
        result['Text_length'] = data[col].str.len()
        # number of words
        result['num_of_words'] = data[col].str.split().str.len()
        # count of non-alphanumeric characters. regex=True is explicit:
        # pandas >= 2.0 defaults str.replace to literal matching.
        result['presence_non_alphanumeric'] = data[col].str.replace('[a-zA-Z0-9]', '', regex=True).str.len()
        # number of distinct stop words in the text
        result['stop_words_count'] = data[col].str.split().apply(lambda words: len(set(words) & stop_word_set))
    return result
#---------------clean_useless_information---------------
def show_purged_reports(data_df, parameter = ['reviewText'], output_type = 'num_of_words'):
data = data_df.copy() # get the copy
# get our reports
reports = text_item_properties( data.loc[:, parameter]);
# find outliers
index = find_outliers(reports, output_type);
# plot the results
ax = mulitple_function_plots(data=reports.drop(index), kde_type = False, plot_type="histogram",data_type="number", fig_size=(15,7),tight_layout=False)
ax = mulitple_function_plots(data=reports.drop(index), kde_type= False , plot_type="boxplot",data_type="number", fig_size=(15,7) , tight_layout=False);
return reports, index
reports, index = show_purged_reports(raw_data_clean)
We still have outliers, but let's see if it's reasonable.
max_print_out(True)
outlier_example = raw_data_clean.loc[reports[reports['reviewText_Text_length'] > 1750].head(1).index]
outlier_example.reviewText.values
Quite reasonbale. Then we are done, here. No further investigation of this data quality issue.
We rerun the above codes to see if there is a problem in summary.
reports, index = show_purged_reports(raw_data_clean, parameter=['summary'])
max_print_out(True)
outlier_example = raw_data_clean.loc[reports[reports['Text_length'] > 120].head(1).index]
outlier_example.reviewText.values
Well, the longest outlier in summary is just a normal sentence.
We can called it a day for the data quality issue of text items.
raw_data_clean.info()
raw_data_clean[raw_data_clean['reviewText'].isna()].head(5)
Without reviewText, it's very hard to predict the overall score. We delete all instances with NaN reviewText first.
#--------------purge_NaN-------------------
def purge_NaN(data_df):
    """Return a copy of the data without rows missing reviewText;
    without a review text the overall score cannot be predicted."""
    cleaned = data_df.copy()
    missing_review = cleaned['reviewText'].isna()
    return cleaned.drop(cleaned[missing_review].index)
new_data_without_nan = purge_NaN(raw_data_clean)
new_data_without_nan.info()
We take a look at style's NaN looks like.
raw_data_clean[raw_data_clean['style'].isna()].head(5)
And the normal style instances
raw_data_clean['style'].head(5)
raw_data_clean['style'].value_counts().head()
len(raw_data_clean[raw_data_clean['style'].isna()])
This feature is not that important. And we can just add one kind of book style on the NaN values.
Or 5492 is really not too much instances, we can just drop it.
Let's see how many NaN with summary.
len(raw_data_clean[raw_data_clean['summary'].isna()])
raw_data_clean[raw_data_clean['summary'].isna()].head(5)
Since we have almost 1 million instances, we can just drop those 141 instances.
raw_data_clean.columns
About NaN values. We drop all instances with NaN values.
| Continuous Feature | Data Quality Issue | Potential Handling Strategies |
|---|---|---|
| overall | The ratio of 5 different scores are different | Need to stratified sample the dataset before training. |
| unixReviewTime | a time series value but in Int | convert to time series and then put into days |
| verified | boolean value | chagne to int |
| Categorical Feature | Data Quality Issue | Potential Handling Strategies |
|---|---|---|
| reviewTime | a time series value but in string | convert to time series and then put into days |
| reviewerID | Not much useful | Drop |
| asin | ID number for books | Can't get the book name so far, have to drop it |
| style | has useless non-aplhanumeric characters | need to purge those characters |
| reviewerName | not very useful | Drop the column |
| reviewText | Has outliers/html tag/URL/useless symbols | Purge outliers and delete useless information |
| summary | No problem for now | No further investigation |
| vote | TOO many NaN values | Drop column |
| image | Too many NaN values | Drop column |
#--------------purge_NaN-------------------
def purge_NaN(data_df):
data = data_df.copy()
# drop vote and image
data = data.drop(['vote','image'], axis = 1)
# drop NaN values
for i in range(len(data.columns)):
data = data.drop(data[data[str(data.columns[i])].isna()].index)
return data
raw_data_copy = raw_data.copy()
raw_data_copy = purge_NaN(raw_data_copy)
raw_data_copy.info()
We only need to use review Text to purge outliers
#---------------clean_useless_information---------------
def purge_outliers(data_df, parameter = 'reviewText', output_type = 'num_of_words'):
    """Drop rows whose `parameter` text has an outlying word count,
    as determined by find_outliers on `output_type`."""
    data = data_df.copy()
    # word counts of the text column drive the outlier detection
    word_counts = pd.DataFrame()
    word_counts[output_type] = data[parameter].str.split().str.len()
    outlier_index = find_outliers(word_counts, output_type)
    return data.drop(outlier_index)
raw_data_copy = purge_outliers(raw_data_copy)
raw_data_copy.info()
raw_data_copy["verified"] = raw_data_copy["verified"].astype(int)
raw_data_copy.info()
First we take a look at the style with long length
# instances with long style length
raw_data_copy[raw_data_copy['style'].str.len() > 30].head()
We can see in style, we need to remove 'Format' and Punctuation.
raw_data_copy['style'] = raw_data_copy['style'].str.replace('[^\w\s]+', '')
raw_data_copy['style'] = raw_data_copy['style'].str.replace('Format', '')
raw_data_copy.head()
Nice that is what we want. Then we delete any punctuation in other text features.
raw_data_copy['reviewerName'] = raw_data_copy['reviewerName'].str.replace('[^\w\s]+', '')
raw_data_copy['reviewText'] = raw_data_copy['reviewText'].str.replace('[^\w\s]+', '')
raw_data_copy['summary'] = raw_data_copy['summary'].str.replace('[^\w\s]+', '')
raw_data_copy.head()
Let's run our reports function to see if there is any punctuation
reports = show_reports(raw_data_copy, parameter=['summary'])
We can see there is no non aplhanumeric value anymore.
That is all we need. The useless features, we will delete in the transformer.
#------------- main transformer ---------------------
# Class for attribute transformer
# import important libray
from sklearn.base import BaseEstimator, TransformerMixin
class combined_attribute_adder_and_cleaner(BaseEstimator, TransformerMixin):
    '''Sklearn-style transformer bundling the whole cleaning recipe:
    NaN purging, dtype fixes, outlier removal, text cleanup, and moving
    the target ("overall") to the last column, renamed "score".
    '''

    def __init__(self, data_cleaner = True, servies_remainer = False, normalization = True): # no *args or **kargs
        # Flags kept for experiments where some cleaning steps are skipped.
        # NOTE(review): "servies_remainer" looks like a typo for
        # "series_remainer", but renaming it would break existing callers.
        self.data_cleaner = data_cleaner
        self.servies_remainer = servies_remainer
        self.normalization = normalization

    def fit(self, X, y=None):
        # stateless transformer: nothing to fit
        return self

    def transform(self, data_df):
        # Work on a copy; mutating the caller's frame is dangerous.
        X = data_df.copy()
        # 0. Drop the mostly-NaN columns, then any row with a NaN left.
        X = X.drop(['vote','image'], axis = 1)
        for i in range(len(X.columns)):
            X = X.drop(X[X[str(X.columns[i])].isna()].index)
        # 1. Booleans -> integers for the model.
        X["verified"] = X["verified"].astype(int)
        # 2. Remove rows with outlying review word counts.
        X = purge_outliers(X)
        # 3. Drop ids/timestamps that carry no predictive signal here.
        X = X.drop(['reviewerID','reviewTime', 'asin', 'unixReviewTime'],axis=1)
        # 4. Strip HTML tags, URLs and other useless characters.
        X = clean_useless_information(X)
        # 5. Remove punctuation / non-alphanumeric noise.
        #    regex= is explicit on every replace: pandas >= 2.0 defaults
        #    Series.str.replace to literal matching.
        X['style'] = X['style'].str.replace(r'[^\w\s]+', '', regex=True)
        X['style'] = X['style'].str.replace('Format', '', regex=False)
        X['reviewerName'] = X['reviewerName'].str.replace(r'[^\w\s]+', '', regex=True)
        X['reviewText'] = X['reviewText'].str.replace(r'[^\w\s]+', '', regex=True)
        X['summary'] = X['summary'].str.replace(r'[^\w\s]+', '', regex=True)
        # Put the target value at the end, renamed "score".
        target = X.pop('overall')
        X['score'] = target
        return X
# Smoke-test the combined transformer on a fresh copy of the raw data.
raw_data_copy = raw_data.copy()
attr_adder_and_cleaner = combined_attribute_adder_and_cleaner()
purged_data = attr_adder_and_cleaner.transform(raw_data_copy)
purged_data.head()
purged_data.info()
Create a pipe line.
#############################PIPE LINE###################################################
# Now we build a transformer to get all the above steps
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# convert_pipeline is for create a whole pipeline but remain the dataFrame structure
# (a single-step sklearn Pipeline keeps the pandas DataFrame output intact;
#  note: StandardScaler is imported here but not used in this pipeline)
convert_pipeline = Pipeline([
    ('attribs_adder_cleaner', combined_attribute_adder_and_cleaner(data_cleaner=True)),
])
Test our pipeline.
converted_data = convert_pipeline.fit_transform(raw_data.copy())
converted_data.info()
Let's draw the report.
#---------------show_reports---------------
def show_reports(data_df, parameter=['reviewText'], output_type='num_of_words'):
    """Compute text statistics for the `parameter` columns and plot them.

    Uses the notebook's `text_item_properties` helper to build the report,
    draws a histogram and a boxplot of it, and returns the report frame.
    The mutable list default is kept for interface compatibility; it is
    never mutated.
    """
    reports = text_item_properties(data_df.copy().loc[:, parameter])
    # histogram view of the statistics
    mulitple_function_plots(data=reports, kde_type=False, plot_type="histogram",
                            data_type="number", fig_size=(15, 7), tight_layout=False)
    # boxplot view of the same statistics
    mulitple_function_plots(data=reports, kde_type=False, plot_type="boxplot",
                            data_type="number", fig_size=(15, 7), tight_layout=False)
    return reports
Draw graph on reviewText
reports = show_reports(converted_data)
Draw graph on summary.
reports = show_reports(converted_data,parameter=['summary'])
Draw graph on style.
reports = show_reports(converted_data,parameter=['style'])
That is what we want.
Before we doing those questions, we need to alter our dataset a bit.
converted_data = convert_pipeline.fit_transform(raw_data.copy())
converted_data.to_csv('purged_data.csv')
converted_data.columns
converted_data.head()
converted_data.info()
converted_data = pd.read_csv('/content/drive/MyDrive/A3/purged_data.csv')
converted_data = converted_data.drop('Unnamed: 0', axis=1) # old index is useless now. drop it
Before we doing any analysis, we first need to remove the stop words.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
len(stop_words)
We can see there are 179 stop words from nltk.corpus
# print first 5
stop_words[:20]
We will first transfer all charater to lower case and remove the stop words.
#--------------remove_stop_words-------------
def remove_stop_words(data, stop_words):
    """Lower-case every non-numeric column of `data` and drop stop words.

    Mutates `data` in place and also returns it (matching the original
    contract).  `stop_words` may be any iterable of lower-case words.
    """
    # A set gives O(1) membership tests; the original list lookup made the
    # per-word check O(len(stop_words)) over millions of words.
    stop_set = set(stop_words)
    feature = data.select_dtypes(exclude="number").columns
    for col in feature:
        print("Now it's removing stop words from ", col)
        # first change all characters to lower case so stop words match
        data[col] = data[col].str.lower()
        data[col] = data[col].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set]))
    return data
converted_data_no_stop_words = remove_stop_words(converted_data,stop_words)
converted_data_copy = converted_data_no_stop_words.copy()
converted_data_no_stop_words.info()
converted_data_no_stop_words.to_csv('no_stop_words_data.csv')
# helper to draw the frequent-words bar plot
#----------------plot_frequenct_words_bar-------------------
def plot_frequenct_words_bar(data, figsize=(15, 10), name='style'):
    """Bar-plot the top-50 word counts held in `data` for feature `name`."""
    fig, axis = plt.subplots(figsize=figsize)
    data.plot.bar(ax=axis)
    axis.set_title("Most frequent 50 words' distribution of " + str(name))
    axis.set_ylabel('Counts')
# helper producing the frequent-words report
#------------------frequent_words_reports---------------------
def frequent_words_reports(data, feature='style'):
    """Return (and bar-plot) the 50 most frequent words of `feature`."""
    max_print_out(True)
    # explode the column into individual words, then count occurrences
    top_counts = data[feature].str.split(expand=True).stack().value_counts().head(50)
    counts_frame = pd.DataFrame(top_counts)
    plot_frequenct_words_bar(counts_frame)
    return counts_frame
Load our prepared no stop words datasets.
converted_data_no_stop_words = pd.read_csv('/content/drive/MyDrive/A3/no_stop_words_data.csv')
converted_data_no_stop_words = converted_data_no_stop_words.drop('Unnamed: 0', axis=1) # old index is useless now. drop it
converted_data_no_stop_words = converted_data_no_stop_words.convert_dtypes()
converted_data_no_stop_words = converted_data_no_stop_words.fillna('empty')
converted_data_no_stop_words.info()
1 Million instances are just too big.
We need to write our own data structure to get the results.
By using dictionaries.
If the word already exists, add 1 to its count; if not, initialize it with 1. Simple.
converted_data_no_stop_words.columns
#--------------- get_words_reports-------------
from tqdm.notebook import tqdm
def get_words_dictionary(data, column_number = 3):
    """Count word occurrences over one text column of `data`.

    Parameters
    ----------
    data : DataFrame whose `column_number`-th column holds whitespace-
        separated text (assumed str — stop-word removal ran earlier).
    column_number : positional index of the text column (default 3).

    Returns
    -------
    dict mapping word -> occurrence count.
    """
    words_dictionary = {}
    # iterate rows with a progress bar; 1M+ instances take a while
    for i in tqdm(range(len(data))):
        text_array = data.iloc[i, column_number]
        for text in text_array.split():
            # dict.get with a default replaces the original "== None"
            # test-and-branch counting (never compare to None with ==)
            words_dictionary[text] = words_dictionary.get(text, 0) + 1
    return words_dictionary
#-------------words_frequency_report--------------
def words_frequency_report(data, feature = 'reviewText',show_all=False,fig_size = (15,10)):
    """Count the word frequencies of `feature`, plot them, return the table.

    Parameters
    ----------
    data : DataFrame containing the text column `feature`.
    feature : column name to analyse.
    show_all : plot every word when True, otherwise only the top 50.
    fig_size : matplotlib figure size for the horizontal bar plot.

    Returns
    -------
    DataFrame with a single 'counts' column, sorted descending.
    """
    # translate the column name into a positional index for iloc access
    column_number = data.columns.get_loc(feature)
    print("Start getting word reports by ", feature)
    words_dictionary = get_words_dictionary(data, column_number)
    print("### Finish get the words report")
    # build the report frame from the raw word -> count dictionary
    report = pd.DataFrame.from_dict(words_dictionary, orient='index')
    report.columns = ['counts']
    print("Load into Pandas dataFrame")
    # most frequent words first
    report = report.sort_values(by=['counts'], ascending=False)
    print("Sorting DataFrame")
    # decide whether to plot everything or only the head
    if show_all:
        report_head = report
    else:
        # get first 50 rows
        report_head = report.head(50)
    print("Get report's top 50 words\nStart plotting")
    # plot setting
    fig, ax = plt.subplots(figsize = fig_size)
    report_head.plot.barh(ax = ax)
    if show_all:
        ax.set_title("Distribution of " + str(feature))
    else:
        ax.set_title("Most frequent 50 words' distribution of " + str(feature))
    ax.set_xlabel('Counts')
    ax.set_ylabel('Words')
    # put the largest count at the top of the chart
    plt.gca().invert_yaxis()
    return report
We do something unusual in style exclusively. We know that the type of book is given by style, and no matter how many words one instance contains, they actually denote a single type of reading material.
Hence, in case we get something like Kindle or Edition as two words, we delete 'space' in each instances' style feature and combine all words as one.
copy = converted_data_no_stop_words.copy()
converted_data_no_stop_words = copy.copy()
# we recover the words with capitalized format
converted_data_no_stop_words['style'] = converted_data_no_stop_words['style'].str.upper()
style_frequent_words = words_frequency_report(converted_data_no_stop_words.copy(), 'style')
We can see we actually get the result of "Edition" as the second top frequent used word. But Edition actually is part of "Kindle Edition" by the most frequent type of Format.
We will handle this later in question ii by remove the space in every format instance to make those words as one word.
style_frequent_words.head(10)
reviewText_frequent_words = words_frequency_report(converted_data_no_stop_words.copy(), 'reviewText')
We can see that we still can enlarge our stop_words array.
Since the top frequent word is book, which is not really useful.
And the top 5 are book, read, one, books and great. Actually great is a good word.
We will deal this later.
reviewText_frequent_words.head(10)
reviewText_frequent_words = words_frequency_report(converted_data_no_stop_words.copy(), 'summary')
This seems like much useful. Five, stars, good, great, love, those are all good words for high score.
reviewText_frequent_words = words_frequency_report(converted_data_no_stop_words.copy(), 'reviewerName')
We reuse our words report function but this time we draw all the words we get.
But first we remove the space in every instance. To make every format as one word.
converted_data_no_stop_words['style'] = converted_data_no_stop_words['style'].str.replace(' ', '')
style_format_report = words_frequency_report(converted_data_no_stop_words, 'style', show_all=True, fig_size = (15,15))
Now, we calculate its proportion.
# First we print all formats' counts
style_format_report
style_format_proportion = style_format_report.div(style_format_report.sum()) # divide by the summation of all format
Now, draw the proportion graph.
fig,ax = plt.subplots(figsize=(10,15))
style_format_proportion.plot.barh(ax=ax)
ax.set_title("Proportion of each format", fontsize=20)
ax.set_ylabel('Proportion')
plt.xticks(fontsize=16);
plt.gca().invert_yaxis()
We can see there are almost 50% format is Kindle Edition.
Proportion data frame:
max_print_out(True)
style_format_proportion
Except the top a few formats, the other format become 0% proportion.
By the counts table, we can see some of them are only 1 or 2 instances in the dataset, they are really rare.
We print the proportion of the not very frequent use format.
fig,ax = plt.subplots(figsize=(10,15))
style_format_proportion.iloc[6:, :].plot.barh(ax=ax)
ax.set_title("Proportion of each format", fontsize=20)
ax.set_ylabel('Proportion')
plt.xticks(fontsize=16);
plt.gca().invert_yaxis()
First we replot the proportion graph.
fig,ax = plt.subplots(figsize=(10,15))
style_format_proportion.plot.barh(ax=ax)
ax.set_title("Proportion of each format", fontsize=20)
ax.set_ylabel('Proportion')
plt.xticks(fontsize=16);
plt.gca().invert_yaxis()
We can see The Most common format of the books is Kindle Edition.
style_format_proportion.head(1).index
We can see The Least common format of the books is Electronics.
Actually, that is not really a book format; we can say print-on-demand is the least common book format.
style_format_proportion.tail(1).index
fig,ax = plt.subplots(figsize=(10,8))
style_format_proportion.iloc[:6,:].plot.bar(ax=ax)
ax.set_title("Proportion of each format", fontsize=20)
ax.set_ylabel('Proportion')
plt.xticks(fontsize=16);
We can see that Kindle Edition are the best seller.
Then following up with PaperBack and Hardcover. People buy paperback since paperback format is cheaper. And it's interesting to see that almost same proportion of hardcover book are sold as well.
And the next one is Mass Paperback book.
What Is a Mass Market Paperback? A mass market paperback book (MMPB), or simply mass paperback, is a mass-produced book that is typically small with thin paper covers and relatively low-quality pages to keep printing costs down. Bestsellers are often printed as mass market paperbook books for wide distribution Masterclass
Small books are also sold well.
Those four format of book are contributed almost all reviews.
We would like to see the overall score in each format book.
style_format_proportion.head(6)
def draw_plot(results):
    """Draw a score boxplot and histogram side by side for `results`."""
    fig, axes = plt.subplots(1, 2, figsize=(15, 8))
    # left panel: boxplot of the score distribution
    sns.boxplot(data=results, ax=axes[0])
    axes[0].set_ylabel('Scores')
    axes[0].set_title('Boxplot of scores')
    # right panel: histogram (title typo "Historgram" fixed)
    sns.histplot(data=results, ax=axes[1])
    axes[1].set_title('Histogram of scores')
kindle_book_review = converted_data_no_stop_words[converted_data_no_stop_words['style'] == 'KINDLEEDITION']
kindle_book_review.head()
print("People who read kindle give average", round(kindle_book_review['score'].mean(),2), " scores")
kindle_score = pd.DataFrame(kindle_book_review['score'])
kindle_score.columns = ['Kindle']
draw_plot(kindle_score)
paperback_review = converted_data_no_stop_words[converted_data_no_stop_words['style'] == 'PAPERBACK']
print("People who read paperback give average", round(paperback_review['score'].mean(),2), " scores")
paperback_score = pd.DataFrame(paperback_review['score'])
paperback_score.columns = ['Paperback']
draw_plot(paperback_score)
# Hardcover subset (the original print message said "paperback" — a
# copy-paste mistake, fixed below).
HARDBACK_review = converted_data_no_stop_words[converted_data_no_stop_words['style'] == 'HARDCOVER']
print("People who read hardcover give average", round(HARDBACK_review['score'].mean(),2), " scores")
HARDBACK_score = pd.DataFrame(HARDBACK_review['score'])
HARDBACK_score.columns = ['HardCover']
draw_plot(HARDBACK_score)
HARDBACK_score.mean()
# Mass market paperback subset (copy-pasted "paperback" message fixed).
MASSMARKETPAPERBACK_review = converted_data_no_stop_words[converted_data_no_stop_words['style'] == 'MASSMARKETPAPERBACK']
print("People who read mass market paperback give average", round(MASSMARKETPAPERBACK_review['score'].mean(),2), " scores")
MASS_paperback_score = pd.DataFrame(MASSMARKETPAPERBACK_review['score'])
MASS_paperback_score.columns = ['MASS_paperback']
draw_plot(MASS_paperback_score)
MASS_paperback_score.mean()
# Board book subset (copy-pasted "paperback" message fixed).
boardbook_review = converted_data_no_stop_words[converted_data_no_stop_words['style'] == 'BOARDBOOK']
print("People who read board book give average", round(boardbook_review['score'].mean(),2), " scores")
boardbook_score = pd.DataFrame(boardbook_review['score'])
boardbook_score.columns = ['BOARDBOOK']
draw_plot(boardbook_score)
# Audio CD subset.  Two copy-paste mistakes fixed: the print message said
# "paperback" and the score column was labeled 'BOARDBOOK'.
AudioCD_review = converted_data_no_stop_words[converted_data_no_stop_words['style'] == 'AUDIOCD']
print("People who read audio CD give average", round(AudioCD_review['score'].mean(),2), " scores")
AudioCD_score = pd.DataFrame(AudioCD_review['score'])
AudioCD_score.columns = ['AUDIOCD']
draw_plot(AudioCD_score)
We can see that except the board Book, other format's reader give the same distribution of scores.
That means the book format probably not a very important feature.
Even Board Book has the same histogram shape as the other formats.
First we merge two features' data.
text_data = converted_data_no_stop_words.copy()
text_data.head()
text_data['reviewText'].head(1)
text_data['summary'].head(1)
text_data['text'] = text_data['summary'] + " " + text_data['reviewText']
text_data.head()
max_print_out(True)
text_data.head(3).text.values
Done.
We already removed stop words in previous section.
Hence, we copy the code to here.
#--------------remove_stop_words-------------
def remove_stop_words(data, stop_words):
    """Lower-case every non-numeric column of `data` and drop stop words.

    (Duplicate of the earlier definition, kept where the notebook re-runs
    it.)  Mutates `data` in place and also returns it.
    """
    # A set gives O(1) membership tests; the original list lookup made the
    # per-word check O(len(stop_words)) over millions of words.
    stop_set = set(stop_words)
    feature = data.select_dtypes(exclude="number").columns
    for col in feature:
        print("Now it's removing stop words from ", col)
        # first change all characters to lower case so stop words match
        data[col] = data[col].str.lower()
        data[col] = data[col].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set]))
    return data
Our data set is already free of stop words.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
text_data = remove_stop_words(text_data, stop_words)
text_data.head()
The following function is for removing non letter characters
#--------------remove_num_non_letters-------------
def remove_num_non_letters(data):
    """Strip punctuation and digits from every non-numeric column, in place.

    Returns the same (mutated) DataFrame for chaining.
    """
    feature = data.select_dtypes(exclude="number").columns
    for col in feature:
        print("Now it's removing num_non_letters from ", col)
        # regex=True is required: pandas >= 2.0 treats str.replace patterns
        # literally by default, which would leave punctuation/digits intact.
        data[col] = data[col].str.replace(r'[^\w\s]+', '', regex=True)
        data[col] = data[col].str.replace(r'[0-9]+', '', regex=True)
    return data
text_data = remove_num_non_letters(text_data)
text_data.head()
Here we choose lemmatization, since it retains each word's meaning, instead of cutting words into small pieces that make no sense (as stemming can).
import nltk
from nltk.stem import PorterStemmer
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
ps = PorterStemmer()
def lemmatize_text(text):
    """Tokenise `text` on whitespace and lemmatize each token as a verb."""
    tokens = w_tokenizer.tokenize(text)
    return [lemmatizer.lemmatize(token, 'v') for token in tokens]
def lemmatize_dataset(data):
    """Add a 'token_text' column holding the lemmatized 'text' column.

    Mutates `data` in place and returns it.
    """
    text_frame = pd.DataFrame(data['text'])
    data['token_text'] = text_frame.text.apply(lemmatize_text)
    return data
lemmatized_text_data = lemmatize_dataset(text_data)
lemmatized_text_data.to_csv('lemmatized_text_data.csv')
# Here we delete useless features
lemmatized_text_data = lemmatized_text_data.drop(['reviewText','summary','text','reviewerName'],axis=1)
lemmatized_text_data['score'] = lemmatized_text_data.pop('score')
lemmatized_text_data.head()
Before we do anything, we first load our converted data from the previously stored csv file.
lemmatized_text_data = pd.read_csv('/content/drive/MyDrive/A3/lemmatized_text_data.csv')
lemmatized_text_data = lemmatized_text_data.drop('Unnamed: 0', axis=1) # old index is useless now. drop it
lemmatized_text_data = lemmatized_text_data.convert_dtypes()
lemmatized_text_data = lemmatized_text_data.fillna('empty')
lemmatized_text_data.info()
lemmatized_text_data.head()
Now we use TF-IDF on the text features one by one, since I didn't find a solution to do all of them at once.
First we do TDIDF on our token_text that are already lemmatized.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# we set for max_features: as most words we saved for 500 words
# in order to not exceed our memory
v = TfidfVectorizer(stop_words='english', max_features = 500)
# get TDIDF array from token_text
x_token_text = v.fit_transform(lemmatized_text_data['token_text'])
Now we have our token TDIDF set with 500 columns.
x_token_text.shape
# save it into a pandas dataframe
tdidf_data = pd.DataFrame(x_token_text.toarray())
tdidf_data.head()
tdidf_data.describe()
Now we get the TDIDF array from style.
And we know that from previous section, there is only 6 types of style are mainly distribution in our dataset. Hence, we declare a new vectorizer and set our max_features = 8
# now we get TDIDF array from style
v = TfidfVectorizer(stop_words='english', max_features = 8)
# get TDIDF array from style
x_style = v.fit_transform(lemmatized_text_data['style'])
x_style.shape
x_style.toarray()
Now we combine then together.
tdidf_data = pd.concat([tdidf_data, pd.DataFrame(x_style.toarray())], axis = 1)
tdidf_data.head()
Now we put the verified column and our target value back to this dataset.
tdidf_data['verified'] = lemmatized_text_data['verified']
tdidf_data['score'] = lemmatized_text_data['score']
tdidf_data.head()
Note, we detached the rest of features like reviewName, which isn't that important. And the review text and summary, since they both are already in the token_text. We don't need those anymore.
Since, this result is too big, we won't save it into file again. Not write a function to generate this result.
#----------------TDIDF_Data_generator---------------
def TDIDF_Data_generator(data, max_features = 500):
    """Build a TF-IDF feature frame from token_text (+ style) of `data`.

    token_text gets up to `max_features` columns (500 by default to keep
    memory bounded); style gets up to 8, since only a handful of formats
    dominate the dataset.  The "verified" flag and the "score" target are
    appended as the last two columns.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    def _feature_names(vec):
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on old versions that lack it.
        getter = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names
        return getter()

    v_text = TfidfVectorizer(stop_words='english', max_features=max_features)
    v_style = TfidfVectorizer(stop_words='english', max_features=8)
    # TF-IDF matrix for the lemmatized tokens
    x_token_text = v_text.fit_transform(data['token_text'])
    tdidf_data = pd.DataFrame(x_token_text.toarray(), columns=_feature_names(v_text))
    # TF-IDF matrix for the book format
    x_style = v_style.fit_transform(data['style'])
    tdidf_data = pd.concat([tdidf_data, pd.DataFrame(x_style.toarray(), columns=_feature_names(v_style))], axis=1)
    # reset the index so rows line up positionally with the TF-IDF frame
    data_copy = data.reset_index()
    tdidf_data['verified'] = data_copy['verified']
    tdidf_data['score'] = data_copy['score']
    return tdidf_data
We will use this function in the next chapter. Since there is no need to show the results twice.
supervised x unsupervised, classification x regression x clustering or similarity matching x etc).
We are solving a superviesed multi-labeled classification task.
lemmatized_text_data = pd.read_csv('/content/drive/MyDrive/A3/lemmatized_text_data.csv')
lemmatized_text_data = lemmatized_text_data.drop('Unnamed: 0', axis=1) # old index is useless now. drop it
lemmatized_text_data = lemmatized_text_data.convert_dtypes()
lemmatized_text_data = lemmatized_text_data.fillna('empty')
lemmatized_text_data.info()
Now, we split the dataset first for more ability to find good features.
we take only 10% of data to do the feature selection. As our 100K dataset.
from sklearn.model_selection import train_test_split
model_data_raw, _ = train_test_split(lemmatized_text_data, test_size=0.88695, random_state=42)
# we drop reviewText and reivewName, summary and text columns, since we already have token_text and review Name is useless.
model_data_raw = model_data_raw.drop(['reviewerName', 'reviewText', 'summary', 'text'], axis = 1)
# rearange the target label feature score to the end
model_data_raw['score'] = model_data_raw.pop('score')
model_data_raw.head()
model_data_raw.info()
That's the final version we need.
model_data = TDIDF_Data_generator(model_data_raw)
model_data = model_data.fillna(0) # in case of losing values during the period of generating TDIDF array
model_data.head()
Split our model_data into train and test set.
from sklearn.model_selection import train_test_split
training_data, test_data = train_test_split(model_data, test_size=0.3, random_state=42)
# get our data set into features and labels
X_train = training_data.iloc[:,:-1]
y_train = training_data.iloc[:,-1:].values.ravel()
y_train = y_train.astype(int)
X_test = test_data.iloc[:,:-1]
First we save the feature name.
features_name = X_train.loc[:,:].columns.tolist();
len(features_name)
Reuse our feature selection function from Assignment 2.
from sklearn.feature_selection import SelectKBest
# feature selection
def select_features_prompt(X_train, y_train, X_test, function):
    """Score every feature of X_train against y_train with `function`.

    Prints one "Feature <i> <name>: <score>" line per feature and returns
    the raw score array.  NOTE(review): relies on the module-level
    `features_name` list matching X_train's column order — confirm before
    reusing elsewhere.  X_test is accepted for interface compatibility but
    no transformed data is returned (the original computed fs.transform on
    both sets and discarded the results; that wasted work is removed here).
    """
    # configure to score all features (k='all' keeps everything)
    fs = SelectKBest(score_func=function, k='all')
    # learn the feature/target relationship from the training data
    fs.fit(X_train, y_train)
    # print each feature's name and score
    for i in range(len(fs.scores_)):
        print(f'Feature {i} {features_name[i]}: { fs.scores_[i]}' )
    return fs.scores_
Get all features importances as following:
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
fscores = select_features_prompt(X_train, y_train,X_test, f_classif)
Save the results into a dataframe.
results_df = pd.DataFrame(fscores, index=features_name , columns = ['importance'])
results_df.head()
We can see the original results are not sorted.
Now we sort those values by descending.
results_df = results_df.sort_values(by=['importance'],ascending=False)
results_df.head(10)
It's clearly to see that waste, bore, love, great, ok, money and etc are most import features in our TFIDF vector space. Now, we plot those as bar plot.
#---------words_importance_plot---------------------
def words_importance_plot(results, fig_size = (15,10)):
fig, ax = plt.subplots(figsize = fig_size)
results.plot.barh(ax=ax)
plt.gca().invert_yaxis()
ax.set_ylabel('Importance')
ax.set_title("Barplot of words' importance")
First we take a look at the bar plot of first 50 importance words.
words_importance_plot(results_df.head(50))
We can see that there could be more importance words.
Let's draw the bar plot of first 100 words.
words_importance_plot(results_df.head(100),fig_size=(15,18))
250 words:
words_importance_plot(results_df.head(250), fig_size = (15,30))
Now, we can see that after a certain number of words, the importances of all words after that are not that important anymore.
Now we plot the boxplot of the result dataframe.
#---------words_importance_plot---------------------
def words_importance_barplot(results, fig_size = (8,8)):
fig, ax = plt.subplots(figsize = fig_size)
results.boxplot(ax=ax)
ax.set_ylabel('Importance')
ax.set_title("Barplot of words' importance")
words_importance_barplot(results_df)
build_continuous_features_report(results_df)
We can see that the mean importance is only 32.23, and max is 1064. We don't want those features with low importance.
find_outliers(results_df,'importance')
Let's redraw the plot with any importance value larger than 51. By our find outliers function we defined before.
words_importance_barplot(results_df[results_df['importance'] > 51])
Let's see how many of those words that importance > 51.
len(results_df[results_df['importance'] > 51])
Only 67 features. Let's redraw the bar plot.
words_importance_plot((results_df[results_df['importance'] > 51]),fig_size=(15,15))
Intuitively, we can see that only a few words at front is important.
However, it doesn't mean everyone would use these words in their reviewText. If they didn't use any of words at top 10 or top 15, we have no clue what their attitude of the product is.
Hence, we must keep the most important features with some redundancy.
In the end, we will take those 67 features all, then create a new dataset.
results_df[results_df['importance'] > 51].index
model_data.head()
Now, we extract all those 67 features with our target label : score.
reduced_model_data = model_data.loc[:, results_df[results_df['importance'] > 51].index]
reduced_model_data['score'] = model_data['score']
reduced_model_data.head(5)
That's what we need for our training process.
And let's save it to a new csv file. We will use this file since now.
reduced_model_data.to_csv('reduced_model_data.csv')
reduced_model_data.info()
reduced_model_data = pd.read_csv('/content/drive/MyDrive/A3/reduced_model_data.csv')
reduced_model_data = reduced_model_data.drop('Unnamed: 0', axis=1) # old index is useless now. drop it
reduced_model_data = reduced_model_data.convert_dtypes()
reduced_model_data.info()
We can see that our csv retains the same data types as our original data.
Then that's the end of our feature selection.
In this assignment, we will still mainly use accuracy as our main metric. Since we have a classification task to solve.
And instead of just using accuracy, we will still save the prediction and use sklearn's classification_report to show all the details about each feature's percision score, recall score and f1 score at the end.
But we will not plot those scores in section 3.
Now we start to split the train and test set from our new feature selected dataset.
reduced_model_data = pd.read_csv('/content/drive/MyDrive/A3/reduced_model_data.csv')
reduced_model_data = reduced_model_data.drop('Unnamed: 0', axis=1) # old index is useless now. drop it
reduced_model_data = reduced_model_data.convert_dtypes()
reduced_model_data.head()
Now we separate our dataset as training set, validation set and test set.
#-----------get_model_set-------------
def get_model_set(data):
    """Split `data` into features X (all but the last column) and int labels y."""
    features = data.iloc[:, :-1]
    labels = data.iloc[:, -1:].values.ravel().astype(int)
    return features, labels
We get our data as X and y as label.
We will use stratified sampling later.
X_raw, y_raw = get_model_set(reduced_model_data)
y_raw
Split our data by stratified sampling by using stratify = y
And we split our set into training, validation and test set.
X_train,X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2, random_state=42, stratify=y_raw)
X_train,X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
Test train for once
from sklearn.ensemble import RandomForestClassifier
forest_cls = RandomForestClassifier(n_estimators = 50, random_state=42,verbose=1)
forest_cls.fit(X_train, y_train)
Now we predict our values on validation set.
from sklearn.metrics import f1_score
churn_prediction = forest_cls.predict(X_valid)
from sklearn.metrics import classification_report
target_names = ['1', '2','3','4','5']
print("Classification report of the first classifier:\n\n",
classification_report(y_valid, churn_prediction, target_names=target_names))
Results are not quite good.
Let's use random Search directly this time.
So we make our max_feature in random search = $\sqrt{67} = 8.18 $. But we will truncate 8.18 to 8, since this is what R will do by default.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distribs = {
'n_estimators': randint(low=1, high = 100),
'max_features': randint(low=1, high = 8),
}
forest_cls_rs = RandomForestClassifier(random_state=42, verbose = 1)
rnd_search = RandomizedSearchCV(forest_cls_rs, param_distributions=param_distribs,
n_iter=10, cv=5, random_state=42)
rnd_search.fit(X_train, y_train)
rnd_search.best_params_
joblib.dump(rnd_search, 'rnd_search.pkl')
Best estimator of our random search is:
rnd_search.best_estimator_
Print the list of we tried and sorted the order
cvres_rnd = rnd_search.cv_results_
for mean_score, params in sorted(zip(cvres_rnd["mean_test_score"], cvres_rnd["params"]), reverse=True):
print(mean_score, params)
We can see that our best estimator is using RSV: bootstrap with 3 random features among all trees and use totally 72 trees.
The work is not done by one day, so we reload our dataset and divide into train,val and test set.
reduced_model_data = pd.read_csv('/content/drive/MyDrive/A3/reduced_model_data.csv')
reduced_model_data = reduced_model_data.drop('Unnamed: 0', axis=1) # old index is useless now. drop it
reduced_model_data = reduced_model_data.convert_dtypes()
X_raw, y_raw = get_model_set(reduced_model_data)
X_train,X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2, random_state=42, stratify=y_raw)
X_train,X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
Now we can train our model with above hyperparamters and test it on validation set.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Train our model
forest_cls_final = RandomForestClassifier(max_features=3, n_estimators=72, random_state=42)
# train our model
forest_cls_final.fit(X_train, y_train)
pred_final_valid = forest_cls_final.predict(X_valid)
target_names = ['1', '2','3','4','5']
print("Classification report of the final classifier on validation set:\n\n",
classification_report(y_valid, pred_final_valid, target_names=target_names))
Cross validation on the training set.
from sklearn.model_selection import cross_val_score
original_result = cross_val_score(forest_cls_final, X_train, y_train, cv=10)
original_result
We will use this result in Q4 for comparing with Q4's result.
Test it on test set.
# test it on test set
prediction_final_test = forest_cls_final.predict(X_test)
print("Classification report of the final classifier on validation set:\n\n",
classification_report(y_test, prediction_final_test, target_names=target_names))
The best accuracy that I can get with this dataset is 0.62.
We can keep increasing the number of features we have. But we will not do that yet. Since we will do part of speech tagging in section 4.
We will keep going from there.
And We can draw the learning curve and validation curve of our model to see whether the training score and validation score has departed.
For learning curve:
Since we are using the accuracy evaluation metric: if the training score becomes very large while the validation score stays small, we know our model is overfitted.
We can based on our results to choose our train set size or number of trees in our model.
For validation curve: Although randomforest algroithm is not very easy to overfitted. We still can tune the hyperparameters to make sure it will not overfit. By looking into the validation curve, if along with the grow of number of trees, the train/loss curve has departed from each other. We know that our model become overfitted at that number of trees.
Use these two tools, we can make sure our model will not overfit our data.
A learning curve is a graphical representation of the relationship between how proficient people are at a task and the amount of experience they have. Proficiency (measured on the vertical axis) usually increases with increased experience (the horizontal axis), that is to say, the more someone, groups, companies or industries perform a task, the better their performance at the task.Wikipedia
We can find out from the LR curve that whether our model is overfitting/underfitting or whether our model could still imporve by more training epoches. It will help identify our model is good or bad.
set the model as our best estimator and finding the learning curve with scoring = accuracy. Sklearn document
from sklearn.model_selection import learning_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import validation_curve
# set the model as our best estimator and finding the learning curve with scoring = accuracy
# NOTE(review): a plotting helper named `learning_curve` is defined in a later
# cell and will shadow this sklearn import — notebook cell order matters here.
N, train_lc, val_lc = learning_curve(RandomForestClassifier(max_features=3, n_estimators=72, random_state=42),
X_train, y_train, cv=5,scoring='accuracy',
train_sizes=np.linspace(0.05, 1, 20)) # separate our training size by np.linspace, to 20 pieces
# Cache the (slow) learning-curve arrays so later runs can reload them.
joblib.dump(N,'N.pkl')
joblib.dump(train_lc,'train_lc.pkl')
joblib.dump(val_lc,'val_lc.pkl')
# Reload from Google Drive — paths differ from the dump location above; the
# pickles were presumably copied to Drive by hand (TODO confirm).
N = joblib.load('/content/drive/MyDrive/A3/pkls/N.pkl')
train_lc = joblib.load('/content/drive/MyDrive/A3/pkls/train_lc.pkl')
val_lc = joblib.load('/content/drive/MyDrive/A3/pkls/val_lc.pkl')
Now we define a function to plot the learning curve.
#------------------------learning_curve------------------
def learning_curve(N, train_lc, val_lc):
    """Plot mean train/validation accuracy against the training-set size.

    NOTE(review): this helper reuses the name `learning_curve` and therefore
    shadows sklearn.model_selection.learning_curve once this cell runs.
    """
    figure_, axes = plt.subplots(figsize=(16, 6))
    # Mean accuracy over the CV folds (axis 1), one point per training size.
    axes.plot(N, np.mean(train_lc, 1), color='blue', label='training score')
    axes.plot(N, np.mean(val_lc, 1), color='red', label='validation score')
    # Dashed reference line at the mean of the final train/validation scores.
    axes.hlines(np.mean([train_lc[-1], val_lc[-1]]), N[0], N[-1],
                color='gray', linestyle='dashed')
    # Axis limits, labels, title and legend.
    axes.set_xlim(N[0], N[-1])
    axes.set_ylim(0.5, 1.2)
    axes.set_xlabel('training size')
    axes.set_ylabel('Accuracy')
    axes.set_title("Random forest Accuracy Train/Valid of our final model")
    axes.legend(loc='best')
    figure_.show()
learning_curve(N, train_lc, val_lc)
Although we used random search and grid search for finding the most proper hyperparameters, we still want to know that how many trees is good enough and how our model is influnenced by the different number of trees. Like what we did in polynomial regression with different degrees.
In this kind of visualization, we can give a intuitive way of showing the chaning of train/valid score by trees number.
This can help up to use a relatively good enough number of trees when we test our model on different dataset. Since if we best estimator only has 22 trees, we set our limit to 50 trees in this plot.
from sklearn.model_selection import validation_curve
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import numpy as np
# Sweep n_estimators from 1 to 49 and record train/validation accuracy.
n_estimators = np.arange(1, 50) # limit of number of estimators
# cv=2 keeps the sweep cheap; each score array has one row per parameter value
# and one column per fold.
train_score, val_score = validation_curve(forest_cls_final, X_train, y_train,
param_name='n_estimators', param_range=n_estimators, cv=2
, scoring = 'accuracy')
# Cache the sweep results, then reload them from Drive.
joblib.dump(train_score,'est_train_score.pkl')
joblib.dump(val_score,'est_val_score.pkl')
train_score = joblib.load('/content/drive/MyDrive/A3/pkls/est_train_score.pkl')
val_score = joblib.load('/content/drive/MyDrive/A3/pkls/est_val_score.pkl')
#------------------------valid_score_curve------------------
def valid_score_curve(train_score, val_score, n_estimators = np.arange(1, 50)):
    """Plot the median train/validation accuracy against the number of trees."""
    figure_, axes = plt.subplots(figsize=(16, 6))
    # Median accuracy over the CV folds (axis 1) for each n_estimators value.
    axes.plot(n_estimators, np.median(train_score, 1), color='blue', label='training score')
    axes.plot(n_estimators, np.median(val_score, 1), color='red', label='validation score')
    # Axis limits, labels, title and legend.
    axes.set_xlim(0, 50)
    axes.set_ylim(0.1, 1.2)
    axes.set_xlabel('number of trees')
    axes.set_ylabel('ACCURACY')
    axes.set_title("Train/Valid ACCURACY loss of different random forest models")
    axes.legend(loc='best')
    plt.show()
# draw the validation curve
valid_score_curve(train_score, val_score, n_estimators = np.arange(1, 50))
Precision-Recall is a useful measure of success of prediction when the classes are very imbalanced. In information retrieval, precision is a measure of result relevancy, while recall is a measure of how many truly relevant results are returned.
The precision-recall curve shows the tradeoff between precision and recall for different threshold. A high area under the curve represents both high recall and high precision, where high precision relates to a low false positive rate, and high recall relates to a low false negative rate. High scores for both show that the classifier is returning accurate results (high precision), as well as returning a majority of all positive results (high recall). sklearn
The function is coming from scikitplot pacakge
This function will draw ROC and percision recall curve
import matplotlib.pyplot as plt
import scikitplot as skplt
#------------draw_roc_curve--------
def draw_roc_or_percision_recall_curve(model,y_test, X_test, type = 'roc'):
    """Plot either a ROC curve (type='roc') or a precision-recall curve.

    Any value of `type` other than 'roc' selects the precision-recall plot.
    """
    # Class-membership probabilities, as required by the scikitplot helpers.
    predicted_probas = model.predict_proba(X_test)
    fig, ax = plt.subplots(figsize = (10,10))
    # Dispatch to the requested scikitplot routine.
    plotter = skplt.metrics.plot_roc if type == 'roc' else skplt.metrics.plot_precision_recall_curve
    plotter(y_test, predicted_probas, ax=ax)
    plt.show()
    return
ROC Curve
# draw ROC curve
draw_roc_or_percision_recall_curve(forest_cls_final, y_test, X_test, type='roc')
Percision Recall curve
# draw Percision Recall curve
draw_roc_or_percision_recall_curve(forest_cls_final, y_test, X_test, type='pr_c')
This time, we first we take a look at our validation curve. Since our accuracy is too low, we would like to know if there is a overfitting by using too many trees.
# draw the validation curve
valid_score_curve(train_score, val_score, n_estimators = np.arange(1, 50))
However, in the validation curve, we can see that our model won't improve after more than 5 trees. Hence the choice of our best estimators won't be a problem of overfitting by large estimators' number. And random forest is famous by not easy to get overfitting by increasing estimators.
However, if our model can fit this data with those few trees. It means our data set has too few variability. We are impossible to find more patterns in the dataset. That is a bad sign.
learning_curve(N, train_lc, val_lc)
And the learning curve confirmed our guessing.
We can see that our model is extremely easy to get overfitted in training model. And it gets near 100% with a very small training size.
That means the words used in validation set doesn't appear in training set. I tried to use all 500 words or 5000 words to train the model, but the result is similar. Since most of words doesn't appear in most of reviewText, it's impossible for our model to asign a correct coef to each parameteres.
Hence, we need to get our words feature as a small group and most importantly they must be important words.
Let's redraw the graph.
# draw ROC curve
draw_roc_or_percision_recall_curve(forest_cls_final, y_test, X_test, type='roc')
We can see the Area under curve as AUC in our dataset is similar. But class 4 has lowest AUC in our ROC graph.
We would say that class 4 has the worst performance in our prediciton by ROC curve.
# BUG FIX: the printed label said "validation set" but prediction_final_test
# was computed on the test set (y_test).
print("Classification report of the final classifier on test set:\n\n",
classification_report(y_test, prediction_final_test, target_names=target_names))
We use our previous defined function to draw this graph.
# draw Precision-Recall curve (type='pr_c' selects the PR plot, not ROC)
draw_roc_or_percision_recall_curve(forest_cls_final, y_test, X_test, type='pr_c')
We can see that except class 5 has a high percision score when recall score increases. Other four classes has a siginificant low percision scores. There is a chance that people who rate the product in lowe scores won't use similar words to express their feelings.
We would see in the original dataset, the prediciton is good on score 5's class. But not good at other classes.
The following is the way of how to calculate all kinds of scores. Wiki
In this task, we need to redefine a few things in our transformer and pipelines for more easier to get our dataset separately. But it's not necessary to write those explictly.
Hence, we put those functions in this Utility Function section for more convenient to use.
#------------- main transformer ---------------------
# Class for attribute transformer
# import important libray
from sklearn.base import BaseEstimator, TransformerMixin
class combined_attribute_adder_and_cleaner(BaseEstimator, TransformerMixin):
    '''data clean transfomer class

    Sklearn-compatible transformer that cleans the raw review DataFrame:
    drops sparse columns and NaN rows, casts `verified` to int, purges
    outliers, strips punctuation/digits/stop words from text columns,
    builds a combined `text` column and moves the target (`overall`,
    renamed `score`) to the last column.
    '''
    def __init__(self, data_cleaner = True, servies_remainer = False, normalization = True): # no *args or **kargs
        # we need to set extra var to ensure do we need to purge the dataset.
        # In my following experments, sometimes we don't need to do so.
        # NOTE(review): transform() below does not actually read these flags.
        self.data_cleaner = data_cleaner
        self.servies_remainer = servies_remainer
        self.normalization = normalization
    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self # nothing else to do
    def transform(self, data_df):
        # we first copy the data from our dataset.
        # operate on original data set sometimes is dangerous.
        X = data_df.copy()
        #0. drop NaN values
        # drop vote and image
        X = X.drop(['vote','image'], axis = 1)
        # drop NaN values (row-wise, one column at a time)
        for i in range(len(X.columns)):
            X = X.drop(X[X[str(X.columns[i])].isna()].index)
        # 1. First we change the feature verified with to integer
        X["verified"] = X["verified"].astype(int)
        # 2. purge outliers (helper defined elsewhere in the notebook)
        X = purge_outliers(X)
        # 3. drop all useless features and categorical features we alreayd transfered
        X = X.drop(['reviewerID','reviewTime', 'asin', 'unixReviewTime'],axis=1)
        # 4. delete HTML tag and other useless characters (external helper)
        X = clean_useless_information(X)
        # 5. clean alphanumeric data
        X['style'] = X['style'].str.replace('Format', '')
        # get text feature (all non-numeric columns)
        feature = X.select_dtypes(exclude="number").columns
        for i in range(len(feature)):
            print("Now it's removing number and alphanumberic from ", feature[i])
            # remove stop words
            # first change all character to lower case
            # NOTE(review): these patterns are regexes — newer pandas versions
            # require an explicit regex=True for str.replace; confirm the
            # pandas version this notebook runs against.
            X[feature[i]] = X[feature[i]].str.replace('[^\w\s]+', '')
            X[feature[i]] = X[feature[i]].str.replace('[0-9]+', '')
        # remove stop words (English list, applied to every text column)
        stop_words = stopwords.words('english')
        for i in range(len(feature)):
            print("Now it's removing stop words from ", feature[i])
            # remove stop words
            # first change all character to lower case
            X[feature[i]] = X[feature[i]].str.lower()
            X[feature[i]] = X[feature[i]].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
        # create new column combining the short summary and the review body
        X['text'] = X['summary'] + " " + X['reviewText']
        #6. clean style's space
        X['style'] = X['style'].str.replace(' ', '')
        # we put our target value at the end
        target = X.pop('overall')
        X['score'] = target
        return X
#############################PIPE LINE###################################################
# Now we build a transformer to get all the above steps
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# convert_pipeline is for create a whole pipeline but remain the dataFrame structure
# Single-step pipeline: just the combined cleaner/attribute-adder defined above.
convert_pipeline = Pipeline([
('attribs_adder_cleaner', combined_attribute_adder_and_cleaner(data_cleaner=True)),
])
By newest classification from our TA. We can use only two datasets for task 4.
Here, I choose to compare dataset, 1 and 3.
Such that, Q3's result: as dataset 1: preprocessed. And dataset 3, POS tagged after preprocessing.
Get our already preprocessed data from before.
# Load the lemmatized dataset produced in an earlier section.
lemmatized_text_data = pd.read_csv('/content/drive/MyDrive/A3/lemmatized_text_data.csv')
lemmatized_text_data = lemmatized_text_data.drop('Unnamed: 0', axis=1) # old index is useless now. drop it
lemmatized_text_data = lemmatized_text_data.convert_dtypes()
# Replace NaNs with a placeholder string so text operations don't fail later.
lemmatized_text_data = lemmatized_text_data.fillna('empty')
lemmatized_text_data.info()
Separate the 880K dataset into 100K dataset.
from sklearn.model_selection import train_test_split
# Keep ~11.3% of the 880K rows (~100K) as the working sample; discard the rest.
model_data_raw, _ = train_test_split(lemmatized_text_data, test_size=0.88695, random_state=42)
# we drop reviewText and reivewName, summary and text columns, since we already have token_text and review Name is useless.
model_data_raw = model_data_raw.drop(['reviewerName', 'reviewText', 'summary'], axis = 1)
# rearange the target label feature score to the end
model_data_raw['score'] = model_data_raw.pop('score')
model_data_raw.info()
model_data_raw.head()
Load Part-of_speech packages
from nltk import word_tokenize, pos_tag, pos_tag_sents
Get all text from the column text
texts = model_data_raw['text'].tolist()
texts
Map our text to word tonkenizer
map(word_tokenize, texts)
Create a new column called POS_text.
We will save our result into this column.
model_data_raw['POS_text'] = pos_tag_sents( model_data_raw['text'].apply(word_tokenize).tolist())
Now, we delete all useless columns we aleady know from Q3.
model_data_raw = model_data_raw.drop(['style', 'token_text'],axis=1)
model_data_raw.head()
model_data_raw.to_csv('Pos_dataset.csv')
joblib.dump(model_data_raw, 'pos_data_raw.pkl')
Now it's time to extract the nouns only words to obtain a bag-of-words tf-idf weighted vector.
First, we create a new column called "only_noun". We will get all NN type of words from the column POS_text and save those into our new column.
# reload the dataset for more easy to access
model_data_raw = pd.read_csv('/content/drive/MyDrive/A3/Pos_dataset.csv')
model_data_raw = model_data_raw.drop('Unnamed: 0', axis=1) # old index is useless now. drop it
model_data_raw = model_data_raw.convert_dtypes()
model_data_raw = model_data_raw.fillna('empty')
model_data_raw.info()
# create a new column called only_noun with empty values
model_data_raw['only_noun'] = pd.NA
model_data_raw.head()
Note: We know that feature POS_text has index 3 and only_noun has index 4. We will use this information for the next step operation.
Now, we write a for loop to iterate all instances and get the Noun by rows then save it to only_noun feature.
# use for loop to get all pos_list
from tqdm.notebook import tqdm
for row in tqdm(range(len(model_data_raw))):
    # Column 3 is POS_text: the (word, tag) pairs for this review.
    tagged = model_data_raw.iloc[row, 3 ]
    # Keep only the tokens tagged as singular nouns ('NN').
    noun_tokens = [token for token, tag in tagged if tag == 'NN']
    # Write the nouns back as one space-joined string (column 4 = only_noun).
    model_data_raw.iloc[row, 4 ] = " ".join(noun_tokens)
Save our results to a new dataframe
data_only_noun = model_data_raw.copy()
data_only_noun.to_csv('data_only_noun_data.csv')
We reset our new dataset's index.
data_only_noun = data_only_noun.reset_index()
data_only_noun = data_only_noun.drop('index',axis=1)
Print the new dataframe's head
data_only_noun.head()
Now we can see in only_noun column, there indeed only nouns.
It's time to get the TF-IDF matrix
#----------------TDIDF_Data_generator---------------
def TDIDF_Data_generator_pos(data, max_features = 500, feature_name='only_noun'):
    """Build a TF-IDF feature DataFrame from one text column of `data`.

    Vectorizes data[feature_name] (English stop words removed, at most
    `max_features` terms to bound memory) and appends the `verified` and
    `score` columns, row-aligned via a reset index.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    # we set for max_features: as most words we saved for 500 words
    # in order to not exceed our memory
    v_test = TfidfVectorizer(stop_words='english', max_features = max_features)
    # get TDIDF array from the requested text column
    x_token_text = v_test.fit_transform(data[feature_name])
    # FIX: get_feature_names() was removed in scikit-learn >= 1.2 in favour of
    # get_feature_names_out(); fall back so both old and new versions work.
    try:
        feature_names = v_test.get_feature_names_out()
    except AttributeError:
        feature_names = v_test.get_feature_names()
    # save it into a pandas dataframe
    tdidf_data = pd.DataFrame(x_token_text.toarray(), columns = feature_names)
    data_copy = data.copy()
    data_copy = data_copy.reset_index() # need to reset index for matching the results
    tdidf_data['verified'] = data_copy['verified']
    tdidf_data['score'] = data_copy['score']
    return tdidf_data
# Reload the nouns-only dataset from Drive.
data_only_noun = pd.read_csv('/content/drive/MyDrive/A3/data_only_noun_data.csv')
data_only_noun = data_only_noun.drop('Unnamed: 0', axis=1) # old index is useless now. drop it
data_only_noun = data_only_noun.convert_dtypes()
data_only_noun = data_only_noun.fillna('0')
data_only_noun.info()
# Build the TF-IDF matrix (500 terms max) from the only_noun column.
only_noun_tdidf_data = TDIDF_Data_generator_pos(data_only_noun)
only_noun_tdidf_data.head()
only_noun_tdidf_data.info()
Firse we get our training, validation, and test set
X_raw, y_raw = get_model_set(only_noun_tdidf_data)
X_train,X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2, random_state=42, stratify=y_raw)
X_train,X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
Then we reuse our feature selection function from Q3.
Save feature names.
# save feature names
features_name = X_train.loc[:,:].columns.tolist();
len(features_name)
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
fscores = select_features_prompt(X_train, y_train,X_test, f_classif)
Draw words importances by our feature selection method.
By frist 150 words.
results_df = pd.DataFrame(fscores, index=features_name , columns = ['importance'])
results_df = results_df.sort_values(by=['importance'],ascending=False)
words_importance_plot(results_df.head(100),fig_size=(15,18))
Now, we draw the boxplot.
words_importance_barplot(results_df)
build_continuous_features_report(results_df)
The mean importances is 14.97, and max is 944.
We would like to select the high importance words.
We can see that in our POS tagged dataset. There are less important words.
Let's run our find outlier function to find the most important words.
find_outliers(results_df,'importance')
Let's redraw the plot with any importance value larger than 20. By our find outliers function we defined before.
words_importance_barplot(results_df[results_df['importance'] > 20])
Let's see how many of those words that importance > 20.
len(results_df[results_df['importance'] > 20])
53 is a little bit less than we have in Q3, as 67. Let's see how good it works.
Now, we trim our dataset down to these 53 features and our target label, score.
# Keep only the columns whose F-score importance exceeded 20, plus the target.
reduced_only_noun_tdidf_data = only_noun_tdidf_data.loc[:, results_df[results_df['importance'] > 20].index]
reduced_only_noun_tdidf_data['score'] = only_noun_tdidf_data['score']
reduced_only_noun_tdidf_data.head(5)
reduced_only_noun_tdidf_data.to_csv('reduced_only_noun_tdidf_data.csv')
That will be the dataset, we will use in train and evaluate stage.
We use the same evaluation metric as in Q3. Accuarcy.
There is no need to change metrics in this stage.
First, we get our training dataset from our reduced feature dataset by our feature selection method.
# Reload the reduced (importance > 20) TF-IDF dataset from Drive.
reduced_only_noun_tdidf_data = pd.read_csv('/content/drive/MyDrive/A3/reduced_only_noun_tdidf_data.csv')
reduced_only_noun_tdidf_data = reduced_only_noun_tdidf_data.drop('Unnamed: 0', axis=1) # old index is useless now. drop it
reduced_only_noun_tdidf_data = reduced_only_noun_tdidf_data.convert_dtypes()
reduced_only_noun_tdidf_data = reduced_only_noun_tdidf_data.fillna('0')
# reduced_only_noun_tdidf_data.info()
# 64/16/20 train/valid/test split, stratified on the target to keep class ratios.
X_raw, y_raw = get_model_set(reduced_only_noun_tdidf_data)
X_train,X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2, random_state=42, stratify=y_raw)
X_train,X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
Test train.
from sklearn.ensemble import RandomForestClassifier
# Baseline: 50-tree random forest on the nouns-only features.
forest_cls = RandomForestClassifier(n_estimators = 50, random_state=42,verbose=1)
forest_cls.fit(X_train, y_train)
from sklearn.metrics import f1_score
churn_prediction = forest_cls.predict(X_valid)
from sklearn.metrics import classification_report
# Class labels are the five star ratings.
target_names = ['1', '2','3','4','5']
print("Classification report of the first classifier:\n\n",
classification_report(y_valid, churn_prediction, target_names=target_names))
Not too surprised. Our accuracy reduced a lot.
The idea of choosing only nouns is not a good one.
People use adjectives to express their feeling of good or bad.
If we only select nouns to predict the scores, we will lack of the biggest benchmark of adjectives. And there will be not too many things left.
Such a bad idea.
But, still we can do hyperparamter for better performance.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
# Search space: number of trees in [1, 100), features per split in [1, 8).
param_distribs = {
'n_estimators': randint(low=1, high = 100),
'max_features': randint(low=1, high = 8),
}
forest_cls_rs = RandomForestClassifier(random_state=42, verbose = 1)
# 10 random draws x 5-fold CV = 50 model fits.
rnd_search = RandomizedSearchCV(forest_cls_rs, param_distributions=param_distribs,
n_iter=10, cv=5, random_state=42)
rnd_search.fit(X_train, y_train)
Best estimator:
rnd_search.best_estimator_
Print the list of we tried and sorted the order
cvres_rnd = rnd_search.cv_results_
for mean_score, params in sorted(zip(cvres_rnd["mean_test_score"], cvres_rnd["params"]), reverse=True):
print(mean_score, params)
We can see that our best estimator is using RSV: bootstrap with 3 random features among all trees and use totally 88 trees.
We will use the same training set and etc in this stage.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Train our model with the best estimators
# (max_features=3, n_estimators=88 come from the randomized search above)
forest_cls_final = RandomForestClassifier(max_features=3, n_estimators=88, random_state=42,
verbose=1)
# train our model
forest_cls_final.fit(X_train, y_train)
# testing the result on validation set
pred_final_valid = forest_cls_final.predict(X_valid)
target_names = ['1', '2','3','4','5']
print("Classification report of the final classifier on validation set:\n\n",
classification_report(y_valid, pred_final_valid, target_names=target_names))
We can see that our result are actually worse. We do cross validation again on training set.
We will use this result to compare with the one from Q3.
from sklearn.model_selection import cross_val_score
# 10-fold CV accuracy on the training set, kept for the section 4.3 comparison.
only_NN_results = cross_val_score(forest_cls_final, X_train, y_train, cv=10)
only_NN_results
We will use the result later in section 4.3
Test it on test set.
# test it on test set
prediction_final_test = forest_cls_final.predict(X_test)
# BUG FIX: the printed label said "validation set" but this report is on the
# test set (y_test).
print("Classification report of the final classifier on test set:\n\n",
classification_report(y_test, prediction_final_test, target_names=target_names))
The accuracy on test set is 0.57.
The procedure is same as Q3. Since we didn't change our model. And We can draw the learning curve and validation curve of our model to see whether the training score and validation score has departed.
For learning curve:
Since we are using the accuracy evaluation metric, if the training score becomes very large while the validation score stays small, then we know our model is overfitted.
We can based on our results to choose our train set size or number of trees in our model.
For validation curve: Although randomforest algroithm is not very easy to overfitted. We still can tune the hyperparameters to make sure it will not overfit. By looking into the validation curve, if along with the grow of number of trees, the train/loss curve has departed from each other. We know that our model become overfitted at that number of trees.
Use these two tools, we can make sure our model will not overfit our data.
A learning curve is a graphical representation of the relationship between how proficient people are at a task and the amount of experience they have. Proficiency (measured on the vertical axis) usually increases with increased experience (the horizontal axis), that is to say, the more someone, groups, companies or industries perform a task, the better their performance at the task.Wikipedia
We can find out from the LR curve that whether our model is overfitting/underfitting or whether our model could still imporve by more training epoches. It will help identify our model is good or bad.
set the model as our best estimator and finding the learning curve with scoring = accuracy. Sklearn document
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import validation_curve
# BUG FIX: re-importing `learning_curve` from sklearn here rebinds the name and
# shadows the plotting helper of the same name defined earlier, so the plotting
# call at the end of this cell would invoke sklearn's function with the wrong
# arguments and fail. Import sklearn's version under an alias instead.
from sklearn.model_selection import learning_curve as sk_learning_curve
# set the model as our best estimator and finding the learning curve with scoring = accuracy
N, train_lc, val_lc = sk_learning_curve(
    RandomForestClassifier(max_features=3, n_estimators=88, random_state=42),
    X_train, y_train, cv=5, scoring='accuracy',
    train_sizes=np.linspace(0.05, 1, 20))  # separate our training size by np.linspace, to 20 pieces
# learning curve (the plotting helper defined earlier, still bound to the name)
learning_curve(N, train_lc, val_lc)
Although we used random search and grid search for finding the most proper hyperparameters, we still want to know that how many trees is good enough and how our model is influnenced by the different number of trees. Like what we did in polynomial regression with different degrees.
In this kind of visualization, we can give a intuitive way of showing the chaning of train/valid score by trees number.
This can help up to use a relatively good enough number of trees when we test our model on different dataset. Since if we best estimator only has 22 trees, we set our limit to 50 trees in this plot.
from sklearn.model_selection import validation_curve
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import numpy as np
n_estimators = np.arange(1, 50) # limit of number of estimators
# Sweep n_estimators with the tuned forest (max_features=3); cv=2 for speed.
train_score, val_score = validation_curve(RandomForestClassifier(max_features=3, n_estimators=88, random_state=42), X_train, y_train,
param_name='n_estimators', param_range=n_estimators, cv=2
, scoring = 'accuracy')
# draw the validation curve
valid_score_curve(train_score, val_score, n_estimators = np.arange(1, 50))
We will reuse the function we defined before in Q3.
ROC Curve
# draw ROC curve
draw_roc_or_percision_recall_curve(forest_cls_final, y_test, X_test, type='roc')
Percision Recall curve
# draw percision recall curve
draw_roc_or_percision_recall_curve(forest_cls_final, y_test, X_test, type='pr_c')
First, we analyze the learning curve.
# learning curve
learning_curve(N, train_lc, val_lc)
We can see that the validation score is not improve by the increasing of training size. Since our dataset is easy to fit, but not easy to separate the prediction between 5 scores.
# BUG FIX: the printed label said "validation set" but prediction_final_test
# was computed on the test set (y_test).
print("Classification report of the final classifier on test set:\n\n",
classification_report(y_test, prediction_final_test, target_names=target_names))
If we take a look at the classification score again, we would find that the most low scores are from predicted score 1,2,3,4.
Even if we used the stratified sampling on the original dataset. The thing is scores from 1 to 4 are not predicting by two reasons. Not too many people gave those scores more often. We saw in data understanding part, people more likely to give high scores on product. That make things hard to predict by only their words.
# draw the validation curve
valid_score_curve(train_score, val_score, n_estimators = np.arange(1, 50))
And the validation curve shows the same pattern with Q3's model.
We will not spend too much time on this. Because we already analyze the result from Q3.
The model start to get stable after 5 or 6 trees. That means the model is very easy to fit the dataset. But hard to improve. We probably need more words in it. Like adjectives!
We resue the function we defined before.
# draw ROC curve
draw_roc_or_percision_recall_curve(forest_cls_final, y_test, X_test, type='roc')
We can see that class 1 still has the lowest value of AUC. But this time, the more classes' AUC curve are dropping or has lower value than we have from the original dataset.
We can see that only nouns indeed has worse performance.
# BUG FIX: the printed label said "validation set" but this is the test-set
# report (y_test, prediction_final_test).
print("Classification report of the final classifier on test set:\n\n",
classification_report(y_test, prediction_final_test, target_names=target_names))
We use our previous defined function to draw this graph.
# draw Precision-Recall curve (type='pr_c' selects the PR plot, not ROC)
draw_roc_or_percision_recall_curve(forest_cls_final, y_test, X_test, type='pr_c')
This time, we can see class 5 has lower precision than in the original dataset. And other 4 classes have the similar performance with the orignial dataset.
Now we create a new dataFrame to save the original result and our only nouns results.
We take another look at the original results we get from Q3. Since the random seed are the same. The training data set is exactly the same between our two test.
Hence, there is no need to retrain our model from Q3.
First we retreive our original result from Q3.
original_result
Then retreive our only nouns results
only_NN_results
Save two results into a new dataframe
# Collect the two 10-fold CV score arrays side by side for comparison.
result = pd.DataFrame()
result['original'] = pd.DataFrame(original_result)
result['only_nn'] = pd.DataFrame(only_NN_results)
result
Draw the boxplot.
# Boxplot of the two CV accuracy distributions (seaborn styling).
figure(figsize=(8, 6), dpi=100)
ax = sns.boxplot(data=result)
plt.ylabel('Accuracy')
plt.title('Boxplot of Accuracy between original set and only nouns set')
We can see the only nouns dataset is much worse than the original dataset.
But we need the significance test to prove our hypothesis.
First, let's see what our results' distributions look like. We would like to find out whether they are normal distributions.
result.hist()
From the above image, we can clearly see that they are not normal distribution.
Hence We can not use t-test. For nonparameteric distribution, we use Kolmogorov-Smirnov statistic test.
from scipy.stats import ks_2samp
#---------------ks_2samp_test------------------
def ks_2samp_test(data, param1='randomForest', param2='NeuralNetwork'):
    """Run a two-sample Kolmogorov-Smirnov test between two columns of `data`.

    Prints the KS statistic and p-value, then the decision at the 5% level.
    """
    max_print_out(True)
    # Two-sample KS test on the raw score arrays.
    statistic, pvalue = ks_2samp(data[param1].values, data[param2].values)
    print(statistic, pvalue)
    # Decide at the conventional 0.05 significance threshold.
    if pvalue > 0.05:
        print('Samples are likely drawn from the same distributions (fail to reject H0)')
    else:
        print(' Samples are likely drawn from different distributions (reject H0)')
ks_2samp_test(result, 'original', 'only_nn')
Now, we can see that those two results are from two different distribution. Since our H0 is rejected.
We take another look at the boxplot again.
figure(figsize=(8, 6), dpi=100)
result.boxplot()
plt.ylabel('Accuracy')
plt.title('Boxplot of Accuracy between original set and only_nn set')
Now, we could say that the original dataset testing accuaracy is much better than the only nouns dataset's result.
The reason is very clearly actually.
When a person would rate some product and write his/her review.
He will not use nouns to express his feeling about good or bad.
That's simply impossible.
People often use adjective to express their feeling.
Like GOOD, GREAT, BAD, Remarkable, etc.
When we only extract nouns, it's basically delete all the useful information to predict the score.
I am superised we can get 61% accuaracy.
It's really worth to try to save all adjective words, instead of nouns.
Now we create a new dataset with extracting adjectives words.
Which is "JJ" in our case.
Now we get our dataset with only adjectives.
First we retrive our dataset from pkl
# first we retrive this dataset
model_data_raw = joblib.load('/content/drive/MyDrive/A3/pkls/pos_data_raw.pkl')
only_adj_data = model_data_raw.copy()
only_adj_data = only_adj_data.reset_index()
only_adj_data = only_adj_data.drop('index',axis = 1)
# create a new column called only_noun with empty values
only_adj_data['only_adj'] = pd.NA
only_adj_data.head()
Now we get all words are belong to adjectives.
# use for loop to get all pos_list
from tqdm.notebook import tqdm
for row in tqdm(range(len(only_adj_data))):
    # Column 3 is POS_text: the (word, tag) pairs for this review.
    tagged = only_adj_data.iloc[row, 3]
    # Keep only the tokens tagged as adjectives ('JJ').
    adj_tokens = [token for token, tag in tagged if tag == 'JJ']
    # Write the adjectives back as one space-joined string (column 4).
    only_adj_data.iloc[row, 4 ] = " ".join(adj_tokens)
# NOTE(review): the new column above was created as 'only_adj', so popping
# 'only_noun' here assumes the frame already carries that column — verify.
only_adj_data['only_adj'] = only_adj_data.pop('only_noun')
Now, we print the new dataset's head
only_adj_data.head()
joblib.dump(only_adj_data,'only_adj_data.pkl')
We can see that there is only adjectives left now.
only_adj_tdidf_data = TDIDF_Data_generator_pos(only_adj_data, max_features= 500,feature_name = 'only_adj')
Get our training, validation and testing dataset.
only_adj_tdidf_data.head()
X_raw, y_raw = get_model_set(only_adj_tdidf_data)
X_train,X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2, random_state=42, stratify=y_raw)
X_train,X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
Pretrain our model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fit the final random forest on the adjective-only TF-IDF features,
# reusing the hyperparameters tuned earlier (72 trees, 3 features per split).
forest_cls_final = RandomForestClassifier(max_features=3, n_estimators=72, random_state=42, verbose=3)
forest_cls_final.fit(X_train, y_train)

# Evaluate on the held-out validation split: one label per star rating.
pred_final_valid = forest_cls_final.predict(X_valid)
target_names = ['1', '2', '3', '4', '5']
report = classification_report(y_valid, pred_final_valid, target_names=target_names)
print("Classification report of the final classifier on validation set:\n\n", report)
Get our cross validation score on training set
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the final classifier on the training split;
# the default scoring for a classifier is accuracy, so this yields 10 scores.
only_adj_results = cross_val_score(forest_cls_final, X_train, y_train, cv=10)
only_adj_results
Save to our result dataFrame
result['only_adj'] = pd.DataFrame(only_adj_results)
Draw the boxplot
# Boxplot comparing the cross-validation accuracy of the three feature sets.
plt.figure(figsize=(8, 6), dpi=100)
result.boxplot()
plt.ylabel('Accuracy')
plt.title('Boxplot of Accuracy between original set, only noun and only adjectives set')
We can see that our adjective-only dataset performs much better than the other two: it has a higher average score and a smaller range between its minimum and maximum.
ks_2samp_test(result, 'original', 'only_adj')
Although we can't pass the significance test, if we had more data I would say that the adjective-only words are our best choice: we use less data yet get very similar results to the original data, and our minimum accuracy value is higher than the original data's result.
This means our adjectives model is more stable.
Stack Overflow. (n.d.). Removing html tags in pandas. https://stackoverflow.com/questions/45999415/removing-html-tags-in-pandas
Stack Overflow. (n.d.). Python remove stop words from pandas dataframe. https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
MasterClass staff. (2022, February 25). Mass Market Paperbacks: 5 Parts of a Mass Market Paperback. MasterClass. https://www.masterclass.com/articles/mass-market-paperback
Spark by Examples. (n.d.). Pandas Combine Two Columns of Text in DataFrame. https://sparkbyexamples.com/pandas/pandas-combine-two-columns-of-text-in-dataframe/
Python Software Foundation. (n.d.). Date time - Basic date and time types. Python. https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
Hmghaly. (2013, October 7). Sklearn plot confusion matrix with labels. Stack Overflow. https://stackoverflow.com/questions/19233771/sklearn-plot-confusion-matrix-with-labels
RDocumentation. (n.d.). Prop.table: Express Table Entries as Fraction of Marginal Table. RDocumentation. https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/prop.table
Learning Curve. (n.d.). In Wikipedia. Retrieved July 3, 2022, from https://en.wikipedia.org/wiki/Learning_curve
Scikit learn. (n.d.). Sklearn.svm.SVC. Scikit learn. https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
Early stopping. (n.d.). In Wikipedia. Retrieved July 3, 2022, from https://en.wikipedia.org/wiki/Early_stopping
Kassambara, A. (2018). Machine Learning Essentials: Practical Guide in R. CreateSpace Independent Publishing Platform. https://www.datanovia.com/en/product/machine-learning-essentials-practical-guide-in-r/?url=/5-bookadvisor/54-machine-learning-essentials/
Learning Curve. (n.d.). In Wikipedia. Retrieved June 22, 2022, from https://en.wikipedia.org/wiki/Learning_curve
Early stopping. (n.d.). In Wikipedia. Retrieved June 22, 2022, from https://en.wikipedia.org/wiki/Early_stopping
Géron, A. (2019). Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow, 2nd Edition. O'Reilly Media, Inc.